/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
*******************************************************************************
* @file
*  ihevc_cmn_utils_neon.h
*
* @brief
*  Common NEON utility functions for unaligned block loads/stores and
*  transposes used in the decoder
*
* @author
*  ittiam
*
* @par List of Functions:
*  - load_unaligned_u8q()
*  - load_unaligned_u8qi()
*  - store_unaligned_u8q()
*  - vtrnq_s64_to_s16()
*  - transpose_s16_4x4d()
*  - transpose_s16_4x4q()
*  - transpose_s16_8x8()
*  - vtrnq_s64_to_s32()
*  - transpose_s32_4x4()
*  - transpose_s32_8x8()
*
* @remarks
*  None
*
*******************************************************************************
*/

#ifndef _IHEVC_CMN_UTILS_NEON_H_
#define _IHEVC_CMN_UTILS_NEON_H_

#include <string.h> /* memcpy() */
#include <arm_neon.h>
#include "ihevc_platform_macros.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
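/* Loads a 4x4 block of 8-bit pixels whose rows are 'stride' bytes apart and
 * packs it into one 128-bit register. When stride == 4 the rows are
 * contiguous, so a single (possibly unaligned) vector load is used directly. */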
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride)
{
    uint8_t a[16];

    if(stride == 4)
        return vld1q_u8(buf);
    memcpy(a, buf, 4);
    buf += stride;
    memcpy(a + 4, buf, 4);
    buf += stride;
    memcpy(a + 8, buf, 4);
    buf += stride;
    memcpy(a + 12, buf, 4);
    return vld1q_u8(a);
}

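/* Loads a 4x4 block from interleaved data: for each of four rows it takes
 * bytes 0, 2, 4 and 6 (every other byte) and packs the 16 samples into one
 * 128-bit register. Presumably intended for picking one component out of
 * interleaved data, e.g. semi-planar chroma ('i' suffix = interleaved). */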
static INLINE uint8x16_t load_unaligned_u8qi(const uint8_t *buf, int stride)
{
    uint8_t a[16];
    uint8_t *b = a;
    int j;

    for(j = 0; j < 4; j++)
    {
        b[0] = buf[0];
        b[1] = buf[2];
        b[2] = buf[4];
        b[3] = buf[6];
        buf += stride;
        b += 4;
    }
    return vld1q_u8(a);
}

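/* Mirror of load_unaligned_u8q(): stores a 128-bit register as a 4x4 block
 * of 8-bit pixels, writing 4 bytes to each of four rows 'stride' bytes apart. */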
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, uint8x16_t b0)
{
    uint8_t a[16];

    vst1q_u8(a, b0);
    memcpy(buf, a, 4);
    buf += stride;
    memcpy(buf, a + 4, 4);
    buf += stride;
    memcpy(buf, a + 8, 4);
    buf += stride;
    memcpy(buf, a + 12, 4);
}

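/* Interleaves the 64-bit halves of two 32-bit vectors (low halves into
 * val[0], high halves into val[1]) and reinterprets the result as 16-bit
 * vectors; used as the final 64-bit swap step of the transposes below. */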
static INLINE int16x8x2_t vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1)
{
    int16x8x2_t b0;

    b0.val[0] = vcombine_s16(
        vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1)));
    b0.val[1] = vcombine_s16(
        vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1)));
    return b0;
}

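/* Transposes a 4x4 block of 16-bit elements held in four d registers.
 * Hypothetical usage sketch (names are illustrative, not from this codebase):
 *
 *     int16x4_t r0 = vld1_s16(src + 0 * src_strd);
 *     int16x4_t r1 = vld1_s16(src + 1 * src_strd);
 *     int16x4_t r2 = vld1_s16(src + 2 * src_strd);
 *     int16x4_t r3 = vld1_s16(src + 3 * src_strd);
 *     transpose_s16_4x4d(&r0, &r1, &r2, &r3); // rows now hold columns
 */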
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3)
{
    // Swap 16 bit elements. Goes from:
    // a0: 00 01 02 03
    // a1: 10 11 12 13
    // a2: 20 21 22 23
    // a3: 30 31 32 33
    // to:
    // b0.val[0]: 00 10 02 12
    // b0.val[1]: 01 11 03 13
    // b1.val[0]: 20 30 22 32
    // b1.val[1]: 21 31 23 33

    const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
    const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

    // Swap 32 bit elements resulting in:
    // c0.val[0]: 00 10 20 30
    // c0.val[1]: 02 12 22 32
    // c1.val[0]: 01 11 21 31
    // c1.val[1]: 03 13 23 33

    const int32x2x2_t c0 =
        vtrn_s32(vreinterpret_s32_s16(b0.val[0]), vreinterpret_s32_s16(b1.val[0]));
    const int32x2x2_t c1 =
        vtrn_s32(vreinterpret_s32_s16(b0.val[1]), vreinterpret_s32_s16(b1.val[1]));

    *a0 = vreinterpret_s16_s32(c0.val[0]);
    *a1 = vreinterpret_s16_s32(c1.val[0]);
    *a2 = vreinterpret_s16_s32(c0.val[1]);
    *a3 = vreinterpret_s16_s32(c1.val[1]);
}

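/* Transposes two 4x4 blocks of 16-bit elements held in the low and high
 * halves of four q registers; each half-block is transposed independently. */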
static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3)
{
    // Swap 16 bit elements. Goes from:
    // a0: 00 01 02 03 04 05 06 07
    // a1: 10 11 12 13 14 15 16 17
    // a2: 20 21 22 23 24 25 26 27
    // a3: 30 31 32 33 34 35 36 37
    // to:
    // b0.val[0]: 00 10 02 12 04 14 06 16
    // b0.val[1]: 01 11 03 13 05 15 07 17
    // b1.val[0]: 20 30 22 32 24 34 26 36
    // b1.val[1]: 21 31 23 33 25 35 27 37

    const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
    const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);

    // Swap 32 bit elements resulting in:
    // c0.val[0]: 00 10 20 30 04 14 24 34
    // c0.val[1]: 02 12 22 32 06 16 26 36
    // c1.val[0]: 01 11 21 31 05 15 25 35
    // c1.val[1]: 03 13 23 33 07 17 27 37

    const int32x4x2_t c0 =
        vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0]));
    const int32x4x2_t c1 =
        vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), vreinterpretq_s32_s16(b1.val[1]));

    *a0 = vreinterpretq_s16_s32(c0.val[0]);
    *a1 = vreinterpretq_s16_s32(c1.val[0]);
    *a2 = vreinterpretq_s16_s32(c0.val[1]);
    *a3 = vreinterpretq_s16_s32(c1.val[1]);
}

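/* Transposes an 8x8 block of 16-bit elements held in eight q registers. */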
static INLINE void transpose_s16_8x8(
    int16x8_t *a0,
    int16x8_t *a1,
    int16x8_t *a2,
    int16x8_t *a3,
    int16x8_t *a4,
    int16x8_t *a5,
    int16x8_t *a6,
    int16x8_t *a7)
{
    // Swap 16 bit elements. Goes from:
    // a0: 00 01 02 03 04 05 06 07
    // a1: 10 11 12 13 14 15 16 17
    // a2: 20 21 22 23 24 25 26 27
    // a3: 30 31 32 33 34 35 36 37
    // a4: 40 41 42 43 44 45 46 47
    // a5: 50 51 52 53 54 55 56 57
    // a6: 60 61 62 63 64 65 66 67
    // a7: 70 71 72 73 74 75 76 77
    // to:
    // b0.val[0]: 00 10 02 12 04 14 06 16
    // b0.val[1]: 01 11 03 13 05 15 07 17
    // b1.val[0]: 20 30 22 32 24 34 26 36
    // b1.val[1]: 21 31 23 33 25 35 27 37
    // b2.val[0]: 40 50 42 52 44 54 46 56
    // b2.val[1]: 41 51 43 53 45 55 47 57
    // b3.val[0]: 60 70 62 72 64 74 66 76
    // b3.val[1]: 61 71 63 73 65 75 67 77
    int16x8x2_t b0, b1, b2, b3, d0, d1, d2, d3;
    int32x4x2_t c0, c1, c2, c3;

    b0 = vtrnq_s16(*a0, *a1);
    b1 = vtrnq_s16(*a2, *a3);
    b2 = vtrnq_s16(*a4, *a5);
    b3 = vtrnq_s16(*a6, *a7);

    // Swap 32 bit elements resulting in:
    // c0.val[0]: 00 10 20 30 04 14 24 34
    // c0.val[1]: 02 12 22 32 06 16 26 36
    // c1.val[0]: 01 11 21 31 05 15 25 35
    // c1.val[1]: 03 13 23 33 07 17 27 37
    // c2.val[0]: 40 50 60 70 44 54 64 74
    // c2.val[1]: 42 52 62 72 46 56 66 76
    // c3.val[0]: 41 51 61 71 45 55 65 75
    // c3.val[1]: 43 53 63 73 47 57 67 77

    c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), vreinterpretq_s32_s16(b1.val[0]));
    c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), vreinterpretq_s32_s16(b1.val[1]));
    c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), vreinterpretq_s32_s16(b3.val[0]));
    c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), vreinterpretq_s32_s16(b3.val[1]));

    // Swap 64 bit elements resulting in:
    // d0.val[0]: 00 10 20 30 40 50 60 70
    // d0.val[1]: 04 14 24 34 44 54 64 74
    // d1.val[0]: 01 11 21 31 41 51 61 71
    // d1.val[1]: 05 15 25 35 45 55 65 75
    // d2.val[0]: 02 12 22 32 42 52 62 72
    // d2.val[1]: 06 16 26 36 46 56 66 76
    // d3.val[0]: 03 13 23 33 43 53 63 73
    // d3.val[1]: 07 17 27 37 47 57 67 77

    d0 = vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
    d1 = vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
    d2 = vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
    d3 = vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

    *a0 = d0.val[0];
    *a1 = d1.val[0];
    *a2 = d2.val[0];
    *a3 = d3.val[0];
    *a4 = d0.val[1];
    *a5 = d1.val[1];
    *a6 = d2.val[1];
    *a7 = d3.val[1];
}

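/* Interleaves the 64-bit halves of two 32-bit vectors: low halves into
 * val[0], high halves into val[1]. */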
static INLINE int32x4x2_t vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1)
{
    int32x4x2_t b0;
    b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
    b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
    return b0;
}

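/* Transposes a 4x4 block of 32-bit elements held in four q registers. */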
static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, int32x4_t *a2, int32x4_t *a3)
{
    // Swap 32 bit elements. Goes from:
    // a0: 00 01 02 03
    // a1: 10 11 12 13
    // a2: 20 21 22 23
    // a3: 30 31 32 33
    // to:
    // b0.val[0]: 00 10 02 12
    // b0.val[1]: 01 11 03 13
    // b1.val[0]: 20 30 22 32
    // b1.val[1]: 21 31 23 33

    const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
    const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

    // Swap 64 bit elements resulting in:
    // c0.val[0]: 00 10 20 30
    // c0.val[1]: 02 12 22 32
    // c1.val[0]: 01 11 21 31
    // c1.val[1]: 03 13 23 33

    const int32x4x2_t c0 = vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
    const int32x4x2_t c1 = vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

    *a0 = c0.val[0];
    *a1 = c1.val[0];
    *a2 = c0.val[1];
    *a3 = c1.val[1];
}

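/* Transposes an 8x8 block of 32-bit elements, with each row held in an
 * int32x4x2_t register pair (val[0] = columns 0-3, val[1] = columns 4-7). */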
static INLINE void transpose_s32_8x8(
    int32x4x2_t *a0,
    int32x4x2_t *a1,
    int32x4x2_t *a2,
    int32x4x2_t *a3,
    int32x4x2_t *a4,
    int32x4x2_t *a5,
    int32x4x2_t *a6,
    int32x4x2_t *a7)
{
    // Swap 32 bit elements. Goes from:
    // a0: 00 01 02 03 04 05 06 07
    // a1: 10 11 12 13 14 15 16 17
    // a2: 20 21 22 23 24 25 26 27
    // a3: 30 31 32 33 34 35 36 37
    // a4: 40 41 42 43 44 45 46 47
    // a5: 50 51 52 53 54 55 56 57
    // a6: 60 61 62 63 64 65 66 67
    // a7: 70 71 72 73 74 75 76 77
    // to:
    // b0: 00 10 02 12 01 11 03 13
    // b1: 20 30 22 32 21 31 23 33
    // b2: 40 50 42 52 41 51 43 53
    // b3: 60 70 62 72 61 71 63 73
    // b4: 04 14 06 16 05 15 07 17
    // b5: 24 34 26 36 25 35 27 37
    // b6: 44 54 46 56 45 55 47 57
    // b7: 64 74 66 76 65 75 67 77

    const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
    const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
    const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
    const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
    const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
    const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
    const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
    const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);

    // Swap 64 bit elements resulting in:
    // c0: 00 10 20 30 02 12 22 32
    // c1: 01 11 21 31 03 13 23 33
    // c2: 40 50 60 70 42 52 62 72
    // c3: 41 51 61 71 43 53 63 73
    // c4: 04 14 24 34 06 16 26 36
    // c5: 05 15 25 35 07 17 27 37
    // c6: 44 54 64 74 46 56 66 76
    // c7: 45 55 65 75 47 57 67 77
    const int32x4x2_t c0 = vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
    const int32x4x2_t c1 = vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
    const int32x4x2_t c2 = vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
    const int32x4x2_t c3 = vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
    const int32x4x2_t c4 = vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
    const int32x4x2_t c5 = vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
    const int32x4x2_t c6 = vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
    const int32x4x2_t c7 = vtrnq_s64_to_s32(b6.val[1], b7.val[1]);

    // Swap 128 bit elements resulting in:
    // a0: 00 10 20 30 40 50 60 70
    // a1: 01 11 21 31 41 51 61 71
    // a2: 02 12 22 32 42 52 62 72
    // a3: 03 13 23 33 43 53 63 73
    // a4: 04 14 24 34 44 54 64 74
    // a5: 05 15 25 35 45 55 65 75
    // a6: 06 16 26 36 46 56 66 76
    // a7: 07 17 27 37 47 57 67 77
    a0->val[0] = c0.val[0];
    a0->val[1] = c2.val[0];
    a1->val[0] = c1.val[0];
    a1->val[1] = c3.val[0];
    a2->val[0] = c0.val[1];
    a2->val[1] = c2.val[1];
    a3->val[0] = c1.val[1];
    a3->val[1] = c3.val[1];
    a4->val[0] = c4.val[0];
    a4->val[1] = c6.val[0];
    a5->val[0] = c5.val[0];
    a5->val[1] = c7.val[0];
    a6->val[0] = c4.val[1];
    a6->val[1] = c6.val[1];
    a7->val[0] = c5.val[1];
    a7->val[1] = c7.val[1];
}
#endif /* _IHEVC_CMN_UTILS_NEON_H_ */