/*
 * Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.

 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at

 * http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.

 */

//see https://github.com/kunpengcompute/AvxToNeon
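// This header emulates a small subset of the AVX double-precision intrinsics
// (the __m256, __m256d and __m128d types plus the _mm256_* operations below)
// on top of AArch64 NEON, following the AvxToNeon approach linked above.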

#ifndef PF_NEON_DBL_FROM_AVX_H
#define PF_NEON_DBL_FROM_AVX_H
#include <arm_neon.h>
#include <assert.h>  /* for the assert() in _mm256_extractf128_pd */


#if defined(__GNUC__) || defined(__clang__)

#pragma push_macro("FORCE_INLINE")
#define FORCE_INLINE static inline __attribute__((always_inline))

#else

#error "Macro name collisions may happen with an unknown compiler"
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif

#define FORCE_INLINE static inline

#endif

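/* A 256-bit AVX vector is modelled as a pair of 128-bit NEON vectors:
 * index 0 holds the low 128-bit lane, index 1 the high lane. */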
typedef struct {
    float32x4_t vect_f32[2];
} __m256;

typedef struct {
    float64x2_t vect_f64[2];
} __m256d;

typedef float64x2_t __m128d;

FORCE_INLINE __m256d _mm256_setzero_pd(void)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0);
    return ret;
}

FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b)
{
    __m256d res_m256d;
    res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]);
    res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]);
    return res_m256d;
}

FORCE_INLINE __m256d _mm256_set1_pd(double a)
{
    __m256d ret;
    ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a);
    return ret;
}

/* vld1q_f64 only requires element alignment, so the aligned and unaligned
 * AVX loads share the same implementation. */
FORCE_INLINE __m256d _mm256_load_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m256d _mm256_loadu_pd (double const * mem_addr)
{
    __m256d res;
    res.vect_f64[0] = vld1q_f64((const double *)mem_addr);
    res.vect_f64[1] = vld1q_f64((const double *)mem_addr + 2);
    return res;
}

FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a)
{
    return a.vect_f64[0];
}

FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8)
{
    assert(imm8 >= 0 && imm8 <= 1);
    return a.vect_f64[imm8];
}

FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a)
{
    __m256d res;
    res.vect_f64[0] = a;
    /* the upper 128-bit lane is intentionally left undefined, as in the AVX intrinsic */
    return res;
}

#endif /* PF_NEON_DBL_FROM_AVX_H */
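
/*
 * Usage sketch (not part of the original header): a hypothetical illustration
 * of how the emulated intrinsics compose. The function name madd4 and its
 * parameters are assumptions for this example only; the header defines no
 * store intrinsic, so the two halves are written back with plain NEON stores.
 * Kept under #if 0 so it does not affect compilation.
 */
#if 0
/* r[i] = a[i] * b[i] + c for four doubles, using the intrinsics above. */
static void madd4(const double *a, const double *b, double c, double *r)
{
    __m256d va = _mm256_loadu_pd(a);
    __m256d vb = _mm256_loadu_pd(b);
    __m256d vr = _mm256_add_pd(_mm256_mul_pd(va, vb), _mm256_set1_pd(c));
    vst1q_f64(r,     _mm256_extractf128_pd(vr, 0)); /* low 128-bit lane  */
    vst1q_f64(r + 2, _mm256_extractf128_pd(vr, 1)); /* high 128-bit lane */
}
#endif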