/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at

 * http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.

 */

// see https://github.com/kunpengcompute/AvxToNeon

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>
#include <assert.h> /* assert() is used by _mm256_extractf128_pd below */

#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

#error "Macro name collisions may happen with an unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif

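/*
 * Minimal AVX double-precision emulation layer: the 256-bit AVX types are
 * modelled as pairs of 128-bit NEON registers, and __m128d maps directly
 * onto a single float64x2_t.
 */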
typedef struct {
    float32x4_t vect_f32[2];
} __m256;

typedef struct {
    float64x2_t vect_f64[2];
} __m256d;

typedef float64x2_t __m128d;

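/* Emulates _mm256_setzero_pd: all four double lanes set to 0.0. */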
FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
    return ret;
}

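/* Emulates _mm256_mul_pd: lane-wise multiply, one vmulq_f64 per 128-bit half. */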
FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

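/* Emulates _mm256_add_pd: lane-wise add of four packed doubles. */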
FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

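/* Emulates _mm256_sub_pd: lane-wise subtract (a - b) of four packed doubles. */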
FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

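/* Emulates _mm256_set1_pd: broadcasts one double to all four lanes. */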
FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
    return ret;
}

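/* Emulates _mm256_load_pd: loads four doubles. Unlike the AVX original, the
   NEON vld1q_f64 load does not require 32-byte alignment. */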
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

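/* Emulates _mm256_loadu_pd: unaligned load of four doubles; identical to the
   aligned variant on NEON. */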
FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

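/* Emulates _mm256_castpd256_pd128: returns the low 128 bits, no conversion. */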
FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    return a.vect_f64[0];
}

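/* Emulates _mm256_extractf128_pd: returns the low (imm8 == 0) or high
   (imm8 == 1) 128-bit half. */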
FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8];
}

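/* Emulates _mm256_castpd128_pd256: places the input in the low half; the
   upper half is left undefined, matching the AVX cast semantics. */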
FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    return res;
}

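/*
 * Illustrative usage (not part of the original header): computes a*b + c on
 * four packed doubles with the emulated intrinsics above.
 *
 *     double a[4] = {1, 2, 3, 4}, b[4] = {5, 6, 7, 8}, c[4] = {9, 10, 11, 12};
 *     __m256d va = _mm256_loadu_pd(a);
 *     __m256d vb = _mm256_loadu_pd(b);
 *     __m256d vc = _mm256_loadu_pd(c);
 *     __m256d r  = _mm256_add_pd(_mm256_mul_pd(va, vb), vc);
 *     float64x2_t lo = _mm256_castpd256_pd128(r);   // lanes 0..1
 *     float64x2_t hi = _mm256_extractf128_pd(r, 1); // lanes 2..3
 */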
#endif /* PF_NEON_DBL_FROM_AVX_H */