/*
Copyright (c) 2020 Dario Mambro ( dario.mambro@gmail.com )
*/

/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_NEON_DBL_H
#define PF_NEON_DBL_H

/*
  NEON 64-bit (double precision) support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && defined(PFFFT_ENABLE_NEON) && (defined(__aarch64__) || defined(__arm64__))

#pragma message (__FILE__ ": NEON (from AVX) macros are defined" )

#include "pf_neon_double_from_avx.h"
typedef __m256d v4sf;

/* 4 doubles per SIMD vector */
# define SIMD_SZ 4

typedef union v4sf_union {
  v4sf v;
  double f[SIMD_SZ];
} v4sf_union;
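
/* Illustrative sketch, not part of the pffft API: the union above lets scalar
   code inspect or patch individual lanes of a SIMD vector, e.g. (using the
   LD_PS1 macro defined below):

     v4sf_union u;
     u.v = LD_PS1(1.0);      // broadcast 1.0 into all 4 lanes
     double third = u.f[2];  // read lane 2 back as a plain double
*/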

# define VARCH "NEON"
# define VREQUIRES_ALIGN 1
# define VZERO() _mm256_setzero_pd()
# define VMUL(a,b) _mm256_mul_pd(a,b)
# define VADD(a,b) _mm256_add_pd(a,b)
# define VMADD(a,b,c) _mm256_add_pd(_mm256_mul_pd(a,b), c)
# define VSUB(a,b) _mm256_sub_pd(a,b)
# define LD_PS1(p) _mm256_set1_pd(p)
# define VLOAD_UNALIGNED(ptr) _mm256_loadu_pd(ptr)
# define VLOAD_ALIGNED(ptr) _mm256_load_pd(ptr)

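/* Emulates the AVX intrinsic _mm256_insertf128_pd(a, b, 1): keeps the low
   128-bit lane of a and replaces the high lane with b. */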
FORCE_INLINE __m256d _mm256_insertf128_pd_1(__m256d a, __m128d b)
{
  __m256d res;
  res.vect_f64[0] = a.vect_f64[0];
  res.vect_f64[1] = b;
  return res;
}

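/* Emulates _mm_shuffle_pd(a, b, 0): returns [ a[0], b[0] ]. */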
FORCE_INLINE __m128d _mm_shuffle_pd_00(__m128d a, __m128d b)
{
  float64x1_t al = vget_low_f64(a);
  float64x1_t bl = vget_low_f64(b);
  return vcombine_f64(al, bl);
}

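/* Emulates _mm_shuffle_pd(a, b, 3): returns [ a[1], b[1] ]. */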
FORCE_INLINE __m128d _mm_shuffle_pd_11(__m128d a, __m128d b)
{
  float64x1_t ah = vget_high_f64(a);
  float64x1_t bh = vget_high_f64(b);
  return vcombine_f64(ah, bh);
}

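/* Emulates _mm256_shuffle_pd(a, b, 0x0): takes the even (low) element of each
   128-bit lane, returning [ a[0], b[0], a[2], b[2] ]. */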
FORCE_INLINE __m256d _mm256_shuffle_pd_00(__m256d a, __m256d b)
{
  __m256d res;
  res.vect_f64[0] = _mm_shuffle_pd_00(a.vect_f64[0], b.vect_f64[0]);
  res.vect_f64[1] = _mm_shuffle_pd_00(a.vect_f64[1], b.vect_f64[1]);
  return res;
}

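/* Emulates _mm256_shuffle_pd(a, b, 0xF): takes the odd (high) element of each
   128-bit lane, returning [ a[1], b[1], a[3], b[3] ]. */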
FORCE_INLINE __m256d _mm256_shuffle_pd_11(__m256d a, __m256d b)
{
  __m256d res;
  res.vect_f64[0] = _mm_shuffle_pd_11(a.vect_f64[0], b.vect_f64[0]);
  res.vect_f64[1] = _mm_shuffle_pd_11(a.vect_f64[1], b.vect_f64[1]);
  return res;
}

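/* Emulates _mm256_permute2f128_pd(a, b, 0x20): concatenates the low 128-bit
   lanes of a and b, returning [ a[0], a[1], b[0], b[1] ]. */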
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x20(__m256d a, __m256d b)
{
  __m256d res;
  res.vect_f64[0] = a.vect_f64[0];
  res.vect_f64[1] = b.vect_f64[0];
  return res;
}

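/* Emulates _mm256_permute2f128_pd(a, b, 0x31): concatenates the high 128-bit
   lanes of a and b, returning [ a[2], a[3], b[2], b[3] ]. */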
FORCE_INLINE __m256d _mm256_permute2f128_pd_0x31(__m256d a, __m256d b)
{
  __m256d res;
  res.vect_f64[0] = a.vect_f64[1];
  res.vect_f64[1] = b.vect_f64[1];
  return res;
}

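/* Reverses the order of all four doubles: returns [ x[3], x[2], x[1], x[0] ]. */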
FORCE_INLINE __m256d _mm256_reverse(__m256d x)
{
  __m256d res;
  float64x2_t low = x.vect_f64[0];
  float64x2_t high = x.vect_f64[1];
  float64x1_t a = vget_low_f64(low);
  float64x1_t b = vget_high_f64(low);
  float64x1_t c = vget_low_f64(high);
  float64x1_t d = vget_high_f64(high);
  res.vect_f64[0] = vcombine_f64(d, c);
  res.vect_f64[1] = vcombine_f64(b, a);
  return res;
}

/* INTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in2[0], in1[1], in2[1] ]
   out2 = [ in1[2], in2[2], in1[3], in2[3] ]
*/
# define INTERLEAVE2(in1, in2, out1, out2) { \
  __m128d low1__ = _mm256_castpd256_pd128(in1); \
  __m128d low2__ = _mm256_castpd256_pd128(in2); \
  __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
  __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
  __m256d tmp__ = _mm256_insertf128_pd_1( \
    _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, low2__)), \
    _mm_shuffle_pd_11(low1__, low2__)); \
  out2 = _mm256_insertf128_pd_1( \
    _mm256_castpd128_pd256(_mm_shuffle_pd_00(high1__, high2__)), \
    _mm_shuffle_pd_11(high1__, high2__)); \
  out1 = tmp__; \
}

/* UNINTERLEAVE2(in1, in2, out1, out2) pseudo code:
   out1 = [ in1[0], in1[2], in2[0], in2[2] ]
   out2 = [ in1[1], in1[3], in2[1], in2[3] ]
*/
# define UNINTERLEAVE2(in1, in2, out1, out2) { \
  __m128d low1__ = _mm256_castpd256_pd128(in1); \
  __m128d low2__ = _mm256_castpd256_pd128(in2); \
  __m128d high1__ = _mm256_extractf128_pd(in1, 1); \
  __m128d high2__ = _mm256_extractf128_pd(in2, 1); \
  __m256d tmp__ = _mm256_insertf128_pd_1( \
    _mm256_castpd128_pd256(_mm_shuffle_pd_00(low1__, high1__)), \
    _mm_shuffle_pd_00(low2__, high2__)); \
  out2 = _mm256_insertf128_pd_1( \
    _mm256_castpd128_pd256(_mm_shuffle_pd_11(low1__, high1__)), \
    _mm_shuffle_pd_11(low2__, high2__)); \
  out1 = tmp__; \
}

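/* VTRANSPOSE4(row0, row1, row2, row3) pseudo code (in-place 4x4 transpose):
   row0 = [ row0[0], row1[0], row2[0], row3[0] ]
   row1 = [ row0[1], row1[1], row2[1], row3[1] ]
   row2 = [ row0[2], row1[2], row2[2], row3[2] ]
   row3 = [ row0[3], row1[3], row2[3], row3[3] ]
*/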
# define VTRANSPOSE4(row0, row1, row2, row3) { \
  __m256d tmp3, tmp2, tmp1, tmp0; \
  \
  tmp0 = _mm256_shuffle_pd_00((row0),(row1)); \
  tmp2 = _mm256_shuffle_pd_11((row0),(row1)); \
  tmp1 = _mm256_shuffle_pd_00((row2),(row3)); \
  tmp3 = _mm256_shuffle_pd_11((row2),(row3)); \
  \
  (row0) = _mm256_permute2f128_pd_0x20(tmp0, tmp1); \
  (row1) = _mm256_permute2f128_pd_0x20(tmp2, tmp3); \
  (row2) = _mm256_permute2f128_pd_0x31(tmp0, tmp1); \
  (row3) = _mm256_permute2f128_pd_0x31(tmp2, tmp3); \
}

/* VSWAPHL(a, b) pseudo code:
   return [ b[0], b[1], a[2], a[3] ]
*/
# define VSWAPHL(a,b) \
  _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_castpd256_pd128(b)), _mm256_extractf128_pd(a, 1))

/* reverse/flip all doubles */
# define VREV_S(a) _mm256_reverse(a)

/* reverse/flip complex doubles */
# define VREV_C(a) _mm256_insertf128_pd_1(_mm256_castpd128_pd256(_mm256_extractf128_pd(a, 1)), _mm256_castpd256_pd128(a))

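/* true when ptr is aligned to the 32-byte (4-double) vector size; pffft
   relies on this check because VREQUIRES_ALIGN is 1 */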
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x1F) == 0)

#endif

#endif /* PF_NEON_DBL_H */