/******************************************************************************
 *
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
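
/*
 * Scalar fallback implementation of the subset of ARM NEON intrinsics
 * used by this project. When the compiler provides NEON (__ARM_NEON),
 * the real <arm_neon.h> is included instead; otherwise each intrinsic
 * is emulated in plain C on top of small element-array structs.
 */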

#if __ARM_NEON

#include <arm_neon.h>

#else
#define __ARM_NEON 1

#include <stdint.h>
#include <string.h>

/* ----------------------------------------------------------------------------
 * Integer
 * -------------------------------------------------------------------------- */

typedef struct { int16_t e[4]; } int16x4_t;

typedef struct { int16_t e[8]; } int16x8_t;
typedef struct { int32_t e[4]; } int32x4_t;
typedef struct { int64_t e[2]; } int64x2_t;

/**
 * Load / Store
 */

/* Load 4 consecutive int16 elements */
__attribute__((unused))
static int16x4_t vld1_s16(const int16_t *p)
{
    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
}


/**
 * Arithmetic
 */

/* Widening multiply: int16 x int16 -> int32, element-wise */
__attribute__((unused))
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
                          a.e[2] * b.e[2], a.e[3] * b.e[3] } };
}

/* Widening multiply-accumulate: r + a * b, element-wise */
__attribute__((unused))
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
{
    return (int32x4_t){ {
        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
}

/* Pairwise add int32 elements, widening to int64, then accumulate */
__attribute__((unused))
static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t r;

    r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]);
    r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]);

    return r;
}


/**
 * Reduce
 */

/* Sum the 4 int32 elements into a scalar */
__attribute__((unused))
static int32_t vaddvq_s32(int32x4_t v)
{
    return v.e[0] + v.e[1] + v.e[2] + v.e[3];
}

/* Sum the 2 int64 elements into a scalar */
__attribute__((unused))
static int64_t vaddvq_s64(int64x2_t v)
{
    return v.e[0] + v.e[1];
}


/**
 * Manipulation
 */

/* Extract a vector from the concatenation of `a` and `b`, starting at
 * element `n`. In real NEON, `n` must be a compile-time constant. */
__attribute__((unused))
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
{
    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
                    b.e[0], b.e[1], b.e[2], b.e[3] };

    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
}

/* Broadcast a scalar to all 4 int32 lanes */
__attribute__((unused))
static int32x4_t vmovq_n_s32(int32_t v)
{
    return (int32x4_t){ { v, v, v, v } };
}

/* Broadcast a scalar to both int64 lanes */
__attribute__((unused))
static int64x2_t vmovq_n_s64(int64_t v)
{
    return (int64x2_t){ { v, v } };
}


/* ----------------------------------------------------------------------------
 * Floating Point
 * -------------------------------------------------------------------------- */

typedef struct { float e[2]; } float32x2_t;
typedef struct { float e[4]; } float32x4_t;

/* De-interleaved pairs of vectors, as returned by the vld2 loads */
typedef struct { float32x2_t val[2]; } float32x2x2_t;
typedef struct { float32x4_t val[2]; } float32x4x2_t;

/**
 * Load / Store
 */

/* Load 2 consecutive floats */
__attribute__((unused))
static float32x2_t vld1_f32(const float *p)
{
    return (float32x2_t){ { p[0], p[1] } };
}

/* Load 4 consecutive floats */
__attribute__((unused))
static float32x4_t vld1q_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
}

/* Load one float and duplicate it to all 4 lanes */
__attribute__((unused))
static float32x4_t vld1q_dup_f32(const float *p)
{
    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
}

/* Load 4 floats, de-interleaving into 2 vectors of 2 */
__attribute__((unused))
static float32x2x2_t vld2_f32(const float *p)
{
    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
                            .val[1] = { { p[1], p[3] } } };
}

/* Load 8 floats, de-interleaving into 2 vectors of 4 */
__attribute__((unused))
static float32x4x2_t vld2q_f32(const float *p)
{
    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
}

/* Store 2 floats */
__attribute__((unused))
static void vst1_f32(float *p, float32x2_t v)
{
    p[0] = v.e[0], p[1] = v.e[1];
}

/* Store 4 floats */
__attribute__((unused))
static void vst1q_f32(float *p, float32x4_t v)
{
    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
}

/**
 * Arithmetic
 */

/* Negate, element-wise */
__attribute__((unused))
static float32x2_t vneg_f32(float32x2_t a)
{
    return (float32x2_t){ { -a.e[0], -a.e[1] } };
}

__attribute__((unused))
static float32x4_t vnegq_f32(float32x4_t a)
{
    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
}

/* Add / subtract, element-wise */
__attribute__((unused))
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
}

__attribute__((unused))
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
}

/* Fused multiply-add: a + b * c, element-wise.
 * Note: this emulation rounds twice, unlike a true fused operation. */
__attribute__((unused))
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
}

/* Fused multiply-subtract: a - b * c, element-wise */
__attribute__((unused))
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    return (float32x2_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
}

__attribute__((unused))
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
{
    return (float32x4_t){ {
        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
}


/**
 * Manipulation
 */

/* Reinterpret a 64-bit integer bit pattern as 2 floats.
 * memcpy avoids the strict-aliasing violation of a pointer cast. */
__attribute__((unused))
static float32x2_t vcreate_f32(uint64_t u)
{
    float f[2];
    memcpy(f, &u, sizeof(f));
    return (float32x2_t){ { f[0], f[1] } };
}

/* Concatenate two 2-element vectors into a 4-element vector */
__attribute__((unused))
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
{
    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
}

/* Extract the low / high half of a 4-element vector */
__attribute__((unused))
static float32x2_t vget_low_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[0], a.e[1] } };
}

__attribute__((unused))
static float32x2_t vget_high_f32(float32x4_t a)
{
    return (float32x2_t){ { a.e[2], a.e[3] } };
}

/* Broadcast a scalar to all 4 lanes */
__attribute__((unused))
static float32x4_t vmovq_n_f32(float v)
{
    return (float32x4_t){ { v, v, v, v } };
}

/* Reverse the elements within each 64-bit half */
__attribute__((unused))
static float32x2_t vrev64_f32(float32x2_t v)
{
    return (float32x2_t){ { v.e[1], v.e[0] } };
}

__attribute__((unused))
static float32x4_t vrev64q_f32(float32x4_t v)
{
    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
}

/* Transpose: interleave the even (vtrn1) or odd (vtrn2) elements */
__attribute__((unused))
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[0], b.e[0] } };
}

__attribute__((unused))
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
{
    return (float32x2_t){ { a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
}

__attribute__((unused))
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
}

/* Zip: interleave the low (vzip1) or high (vzip2) halves */
__attribute__((unused))
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
}

__attribute__((unused))
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
{
    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
}


#endif /* __ARM_NEON */
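
/*
 * A minimal usage sketch (not part of the original header): a 16-bit dot
 * product with 64-bit accumulation built from the intrinsics above. The
 * NEON_EMU_EXAMPLE guard is hypothetical; compile with -DNEON_EMU_EXAMPLE
 * to build it standalone. On an AArch64 toolchain the same code runs
 * against the real <arm_neon.h>.
 */
#ifdef NEON_EMU_EXAMPLE
#include <stdio.h>

static int64_t dot_s16(const int16_t *a, const int16_t *b, int n)
{
    int64x2_t acc = vmovq_n_s64(0);

    /* Multiply 4 lanes at a time, widening 16 -> 32 bits, then
     * pairwise-accumulate the 32-bit products into 64-bit lanes */
    for (int i = 0; i < n; i += 4)
        acc = vpadalq_s32(acc, vmull_s16(vld1_s16(a + i), vld1_s16(b + i)));

    return vaddvq_s64(acc);
}

int main(void)
{
    const int16_t a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const int16_t b[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };

    /* 1*8 + 2*7 + ... + 8*1 = 120 */
    printf("dot = %lld\n", (long long)dot_s16(a, b, 8));
    return 0;
}
#endif /* NEON_EMU_EXAMPLE */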