1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ihevc_cmn_utils_neon.h
25 *
26 * @brief
*  Inline NEON helper functions: strided 4x4 byte-block load/store and 16-bit / 32-bit matrix transposes
28 *
29 * @author
30 *  ittiam
31 *
32 * @par List of Functions:
33 *
34 * @remarks
35 *  None
36 *
37 *******************************************************************************
38 */
39 
40 #ifndef _IHEVC_CMN_UTILS_NEON_H_
41 #define _IHEVC_CMN_UTILS_NEON_H_
42 
#include <arm_neon.h>
#include <string.h>
#include "ihevc_platform_macros.h"
45 
46 /*****************************************************************************/
47 /* Function Definitions                                                      */
48 /*****************************************************************************/
/**
 * Gathers a 4x4 block of bytes into a single 128-bit vector.
 *
 * Four bytes are read from each of four rows; successive rows start
 * 'stride' bytes apart. Row 0 fills lanes 0-3, row 1 lanes 4-7, row 2
 * lanes 8-11 and row 3 lanes 12-15.
 *
 * @param buf     address of the first byte of row 0 (no alignment needed)
 * @param stride  distance in bytes between the starts of consecutive rows
 *
 * @return the sixteen gathered bytes
 */
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride)
{
    uint8_t rows[16];
    int r;

    // With a stride of 4 the rows are contiguous, so one 16-byte load suffices.
    if(4 == stride)
    {
        return vld1q_u8(buf);
    }

    // Otherwise stage the rows in a local array; memcpy keeps the accesses
    // free of any alignment requirement.
    memcpy(&rows[0], buf, 4);
    for(r = 1; r < 4; r++)
    {
        buf += stride;
        memcpy(&rows[4 * r], buf, 4);
    }
    return vld1q_u8(rows);
}
64 
/**
 * Gathers every second byte of a 4-row block into a single 128-bit vector.
 *
 * From each of four rows (starting 'stride' bytes apart) the bytes at
 * offsets 0, 2, 4 and 6 are taken. Row 0 fills lanes 0-3, row 1 lanes 4-7,
 * row 2 lanes 8-11 and row 3 lanes 12-15.
 * NOTE(review): presumably meant for picking one component out of a
 * two-component interleaved buffer - confirm against the callers.
 *
 * @param buf     address of the first byte of row 0
 * @param stride  distance in bytes between the starts of consecutive rows
 *
 * @return the sixteen gathered bytes
 */
static INLINE uint8x16_t load_unaligned_u8qi(const uint8_t *buf, int stride)
{
    uint8_t picked[16];
    int row;
    int col;

    for(row = 0; row < 4; row++)
    {
        // Keep the even-offset bytes of this row, skip the odd ones.
        for(col = 0; col < 4; col++)
        {
            picked[4 * row + col] = buf[2 * col];
        }
        buf += stride;
    }
    return vld1q_u8(picked);
}
82 
/**
 * Scatters a 128-bit vector into memory as a 4x4 block of bytes.
 *
 * Lanes 0-3 are written to row 0, lanes 4-7 to row 1, lanes 8-11 to row 2
 * and lanes 12-15 to row 3; successive rows start 'stride' bytes apart.
 * Rows are written in ascending order.
 *
 * @param buf     address of the first byte of row 0 (no alignment needed)
 * @param stride  distance in bytes between the starts of consecutive rows
 * @param b0      the sixteen bytes to store
 */
static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, uint8x16_t b0)
{
    uint8_t lanes[16];
    int r;

    // Spill the vector to a local array, then copy it out four bytes per row.
    vst1q_u8(lanes, b0);

    memcpy(buf, &lanes[0], 4);
    for(r = 1; r < 4; r++)
    {
        buf += stride;
        memcpy(buf, &lanes[4 * r], 4);
    }
}
96 
/**
 * Exchanges the 64-bit halves of two 128-bit vectors and returns the result
 * viewed as 16-bit lanes.
 *
 *   result.val[0] = { low half of a0,  low half of a1  }
 *   result.val[1] = { high half of a0, high half of a1 }
 *
 * @param a0  first input vector
 * @param a1  second input vector
 *
 * @return the two recombined vectors, reinterpreted as int16x8_t
 */
static INLINE int16x8x2_t vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1)
{
    const int16x4_t a0_lo = vreinterpret_s16_s32(vget_low_s32(a0));
    const int16x4_t a1_lo = vreinterpret_s16_s32(vget_low_s32(a1));
    const int16x4_t a0_hi = vreinterpret_s16_s32(vget_high_s32(a0));
    const int16x4_t a1_hi = vreinterpret_s16_s32(vget_high_s32(a1));
    int16x8x2_t swapped;

    swapped.val[0] = vcombine_s16(a0_lo, a1_lo);
    swapped.val[1] = vcombine_s16(a0_hi, a1_hi);
    return swapped;
}
107 
/**
 * Transposes, in place, a 4x4 matrix of 16-bit values held as four
 * 64-bit row vectors.
 *
 * @param a0  in: row 0, out: column 0
 * @param a1  in: row 1, out: column 1
 * @param a2  in: row 2, out: column 2
 * @param a3  in: row 3, out: column 3
 */
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, int16x4_t *a2, int16x4_t *a3)
{
    int16x4x2_t rows01;
    int16x4x2_t rows23;
    int32x2x2_t cols02;
    int32x2x2_t cols13;

    // Stage 1: transpose at 16-bit granularity within each pair of rows.
    // Element "rc" below is row r, column c of the input.
    //   rows01.val[0]: 00 10 02 12      rows01.val[1]: 01 11 03 13
    //   rows23.val[0]: 20 30 22 32      rows23.val[1]: 21 31 23 33
    rows01 = vtrn_s16(*a0, *a1);
    rows23 = vtrn_s16(*a2, *a3);

    // Stage 2: transpose at 32-bit granularity across the two row pairs.
    //   cols02.val[0]: 00 10 20 30      cols02.val[1]: 02 12 22 32
    //   cols13.val[0]: 01 11 21 31      cols13.val[1]: 03 13 23 33
    cols02 = vtrn_s32(vreinterpret_s32_s16(rows01.val[0]),
                      vreinterpret_s32_s16(rows23.val[0]));
    cols13 = vtrn_s32(vreinterpret_s32_s16(rows01.val[1]),
                      vreinterpret_s32_s16(rows23.val[1]));

    // Hand the columns back in natural order 0, 1, 2, 3.
    *a0 = vreinterpret_s16_s32(cols02.val[0]);
    *a1 = vreinterpret_s16_s32(cols13.val[0]);
    *a2 = vreinterpret_s16_s32(cols02.val[1]);
    *a3 = vreinterpret_s16_s32(cols13.val[1]);
}
140 
/**
 * Transposes, in place, two side-by-side 4x4 blocks of 16-bit values held
 * in four 128-bit row vectors.
 *
 * Lanes 0-3 of the four rows form one 4x4 block and lanes 4-7 form another;
 * each block is transposed independently and stays in its own half:
 *
 *   a0: 00 10 20 30  04 14 24 34
 *   a1: 01 11 21 31  05 15 25 35
 *   a2: 02 12 22 32  06 16 26 36
 *   a3: 03 13 23 33  07 17 27 37
 *
 * (element "rc" is row r, column c of the input).
 *
 * @param a0  in: row 0, out: columns 0 and 4
 * @param a1  in: row 1, out: columns 1 and 5
 * @param a2  in: row 2, out: columns 2 and 6
 * @param a3  in: row 3, out: columns 3 and 7
 */
static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3)
{
    int16x8x2_t rows01;
    int16x8x2_t rows23;
    int32x4x2_t even_cols;
    int32x4x2_t odd_cols;

    // Stage 1: transpose at 16-bit granularity within each pair of rows.
    //   rows01.val[0]: 00 10 02 12  04 14 06 16
    //   rows01.val[1]: 01 11 03 13  05 15 07 17
    //   rows23.val[0]: 20 30 22 32  24 34 26 36
    //   rows23.val[1]: 21 31 23 33  25 35 27 37
    rows01 = vtrnq_s16(*a0, *a1);
    rows23 = vtrnq_s16(*a2, *a3);

    // Stage 2: transpose at 32-bit granularity across the two row pairs.
    //   even_cols.val[0]: 00 10 20 30  04 14 24 34
    //   even_cols.val[1]: 02 12 22 32  06 16 26 36
    //   odd_cols.val[0]:  01 11 21 31  05 15 25 35
    //   odd_cols.val[1]:  03 13 23 33  07 17 27 37
    even_cols = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[0]),
                          vreinterpretq_s32_s16(rows23.val[0]));
    odd_cols = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[1]),
                         vreinterpretq_s32_s16(rows23.val[1]));

    *a0 = vreinterpretq_s16_s32(even_cols.val[0]);
    *a1 = vreinterpretq_s16_s32(odd_cols.val[0]);
    *a2 = vreinterpretq_s16_s32(even_cols.val[1]);
    *a3 = vreinterpretq_s16_s32(odd_cols.val[1]);
}
173 
/**
 * Transposes, in place, an 8x8 matrix of 16-bit values held as eight
 * 128-bit row vectors.
 *
 * The transpose is built in three stages of increasing granularity:
 * 16-bit lanes within row pairs, 32-bit lanes across row pairs, and finally
 * 64-bit halves between the upper four and lower four rows.
 * In the diagrams, element "rc" is row r, column c of the input.
 *
 * @param a0  in: row 0, out: column 0
 * @param a1  in: row 1, out: column 1
 * @param a2  in: row 2, out: column 2
 * @param a3  in: row 3, out: column 3
 * @param a4  in: row 4, out: column 4
 * @param a5  in: row 5, out: column 5
 * @param a6  in: row 6, out: column 6
 * @param a7  in: row 7, out: column 7
 */
static INLINE void transpose_s16_8x8(
    int16x8_t *a0,
    int16x8_t *a1,
    int16x8_t *a2,
    int16x8_t *a3,
    int16x8_t *a4,
    int16x8_t *a5,
    int16x8_t *a6,
    int16x8_t *a7)
{
    // Stage 1: 16-bit transpose inside each pair of adjacent rows.
    //   rows01.val[0]: 00 10 02 12 04 14 06 16
    //   rows01.val[1]: 01 11 03 13 05 15 07 17
    //   rows23.val[0]: 20 30 22 32 24 34 26 36
    //   rows23.val[1]: 21 31 23 33 25 35 27 37
    //   rows45.val[0]: 40 50 42 52 44 54 46 56
    //   rows45.val[1]: 41 51 43 53 45 55 47 57
    //   rows67.val[0]: 60 70 62 72 64 74 66 76
    //   rows67.val[1]: 61 71 63 73 65 75 67 77
    const int16x8x2_t rows01 = vtrnq_s16(*a0, *a1);
    const int16x8x2_t rows23 = vtrnq_s16(*a2, *a3);
    const int16x8x2_t rows45 = vtrnq_s16(*a4, *a5);
    const int16x8x2_t rows67 = vtrnq_s16(*a6, *a7);

    // Stage 2: 32-bit transpose across row pairs ("top" = rows 0-3,
    // "bot" = rows 4-7; "even"/"odd" refers to the column index).
    //   top_even.val[0]: 00 10 20 30 04 14 24 34
    //   top_even.val[1]: 02 12 22 32 06 16 26 36
    //   top_odd.val[0]:  01 11 21 31 05 15 25 35
    //   top_odd.val[1]:  03 13 23 33 07 17 27 37
    //   bot_even.val[0]: 40 50 60 70 44 54 64 74
    //   bot_even.val[1]: 42 52 62 72 46 56 66 76
    //   bot_odd.val[0]:  41 51 61 71 45 55 65 75
    //   bot_odd.val[1]:  43 53 63 73 47 57 67 77
    const int32x4x2_t top_even = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[0]),
                                           vreinterpretq_s32_s16(rows23.val[0]));
    const int32x4x2_t top_odd = vtrnq_s32(vreinterpretq_s32_s16(rows01.val[1]),
                                          vreinterpretq_s32_s16(rows23.val[1]));
    const int32x4x2_t bot_even = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[0]),
                                           vreinterpretq_s32_s16(rows67.val[0]));
    const int32x4x2_t bot_odd = vtrnq_s32(vreinterpretq_s32_s16(rows45.val[1]),
                                          vreinterpretq_s32_s16(rows67.val[1]));

    // Stage 3: swap 64-bit halves between the top and bottom groups.
    //   cols04.val[0]: 00 10 20 30 40 50 60 70
    //   cols04.val[1]: 04 14 24 34 44 54 64 74
    //   cols15.val[0]: 01 11 21 31 41 51 61 71
    //   cols15.val[1]: 05 15 25 35 45 55 65 75
    //   cols26.val[0]: 02 12 22 32 42 52 62 72
    //   cols26.val[1]: 06 16 26 36 46 56 66 76
    //   cols37.val[0]: 03 13 23 33 43 53 63 73
    //   cols37.val[1]: 07 17 27 37 47 57 67 77
    const int16x8x2_t cols04 = vtrnq_s64_to_s16(top_even.val[0], bot_even.val[0]);
    const int16x8x2_t cols15 = vtrnq_s64_to_s16(top_odd.val[0], bot_odd.val[0]);
    const int16x8x2_t cols26 = vtrnq_s64_to_s16(top_even.val[1], bot_even.val[1]);
    const int16x8x2_t cols37 = vtrnq_s64_to_s16(top_odd.val[1], bot_odd.val[1]);

    // Return the columns in natural order 0..7.
    *a0 = cols04.val[0];
    *a1 = cols15.val[0];
    *a2 = cols26.val[0];
    *a3 = cols37.val[0];
    *a4 = cols04.val[1];
    *a5 = cols15.val[1];
    *a6 = cols26.val[1];
    *a7 = cols37.val[1];
}
249 
/**
 * Exchanges the 64-bit halves of two 128-bit vectors of 32-bit lanes.
 *
 *   result.val[0] = { low half of a0,  low half of a1  }
 *   result.val[1] = { high half of a0, high half of a1 }
 *
 * @param a0  first input vector
 * @param a1  second input vector
 *
 * @return the two recombined vectors
 */
static INLINE int32x4x2_t vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1)
{
    const int32x2_t a0_lo = vget_low_s32(a0);
    const int32x2_t a1_lo = vget_low_s32(a1);
    const int32x2_t a0_hi = vget_high_s32(a0);
    const int32x2_t a1_hi = vget_high_s32(a1);
    int32x4x2_t swapped;

    swapped.val[0] = vcombine_s32(a0_lo, a1_lo);
    swapped.val[1] = vcombine_s32(a0_hi, a1_hi);
    return swapped;
}
257 
/**
 * Transposes, in place, a 4x4 matrix of 32-bit values held as four
 * 128-bit row vectors.
 *
 * @param a0  in: row 0, out: column 0
 * @param a1  in: row 1, out: column 1
 * @param a2  in: row 2, out: column 2
 * @param a3  in: row 3, out: column 3
 */
static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, int32x4_t *a2, int32x4_t *a3)
{
    int32x4x2_t rows01;
    int32x4x2_t rows23;
    int32x4x2_t cols02;
    int32x4x2_t cols13;

    // Stage 1: transpose at 32-bit granularity within each pair of rows.
    // Element "rc" below is row r, column c of the input.
    //   rows01.val[0]: 00 10 02 12      rows01.val[1]: 01 11 03 13
    //   rows23.val[0]: 20 30 22 32      rows23.val[1]: 21 31 23 33
    rows01 = vtrnq_s32(*a0, *a1);
    rows23 = vtrnq_s32(*a2, *a3);

    // Stage 2: exchange 64-bit halves across the two row pairs.
    //   cols02.val[0]: 00 10 20 30      cols02.val[1]: 02 12 22 32
    //   cols13.val[0]: 01 11 21 31      cols13.val[1]: 03 13 23 33
    cols02 = vtrnq_s64_to_s32(rows01.val[0], rows23.val[0]);
    cols13 = vtrnq_s64_to_s32(rows01.val[1], rows23.val[1]);

    // Hand the columns back in natural order 0, 1, 2, 3.
    *a0 = cols02.val[0];
    *a1 = cols13.val[0];
    *a2 = cols02.val[1];
    *a3 = cols13.val[1];
}
288 
/**
 * Transposes, in place, an 8x8 matrix of 32-bit values.
 *
 * Each row is a pair of 128-bit vectors: val[0] holds columns 0-3 and
 * val[1] holds columns 4-7. The matrix is handled as four 4x4 quadrants;
 * every quadrant is transposed and the two off-diagonal quadrants trade
 * places when the results are written back.
 * In the diagrams, element "rc" is row r, column c of the input.
 *
 * @param a0  in: row 0, out: column 0
 * @param a1  in: row 1, out: column 1
 * @param a2  in: row 2, out: column 2
 * @param a3  in: row 3, out: column 3
 * @param a4  in: row 4, out: column 4
 * @param a5  in: row 5, out: column 5
 * @param a6  in: row 6, out: column 6
 * @param a7  in: row 7, out: column 7
 */
static INLINE void transpose_s32_8x8(
    int32x4x2_t *a0,
    int32x4x2_t *a1,
    int32x4x2_t *a2,
    int32x4x2_t *a3,
    int32x4x2_t *a4,
    int32x4x2_t *a5,
    int32x4x2_t *a6,
    int32x4x2_t *a7)
{
    // Stage 1: 32-bit transpose inside each pair of adjacent rows, done
    // separately for the left (columns 0-3) and right (columns 4-7) halves.
    // Shown as val[0] followed by val[1]:
    //   left01:  00 10 02 12 | 01 11 03 13
    //   left23:  20 30 22 32 | 21 31 23 33
    //   left45:  40 50 42 52 | 41 51 43 53
    //   left67:  60 70 62 72 | 61 71 63 73
    //   right01: 04 14 06 16 | 05 15 07 17
    //   right23: 24 34 26 36 | 25 35 27 37
    //   right45: 44 54 46 56 | 45 55 47 57
    //   right67: 64 74 66 76 | 65 75 67 77
    const int32x4x2_t left01 = vtrnq_s32(a0->val[0], a1->val[0]);
    const int32x4x2_t left23 = vtrnq_s32(a2->val[0], a3->val[0]);
    const int32x4x2_t left45 = vtrnq_s32(a4->val[0], a5->val[0]);
    const int32x4x2_t left67 = vtrnq_s32(a6->val[0], a7->val[0]);
    const int32x4x2_t right01 = vtrnq_s32(a0->val[1], a1->val[1]);
    const int32x4x2_t right23 = vtrnq_s32(a2->val[1], a3->val[1]);
    const int32x4x2_t right45 = vtrnq_s32(a4->val[1], a5->val[1]);
    const int32x4x2_t right67 = vtrnq_s32(a6->val[1], a7->val[1]);

    // Stage 2: exchange 64-bit halves, completing each quadrant's transpose
    // ("top" = rows 0-3, "bot" = rows 4-7; "even"/"odd" is the column index).
    // Shown as val[0] followed by val[1]:
    //   lt_even: 00 10 20 30 | 02 12 22 32
    //   lt_odd:  01 11 21 31 | 03 13 23 33
    //   lb_even: 40 50 60 70 | 42 52 62 72
    //   lb_odd:  41 51 61 71 | 43 53 63 73
    //   rt_even: 04 14 24 34 | 06 16 26 36
    //   rt_odd:  05 15 25 35 | 07 17 27 37
    //   rb_even: 44 54 64 74 | 46 56 66 76
    //   rb_odd:  45 55 65 75 | 47 57 67 77
    const int32x4x2_t lt_even = vtrnq_s64_to_s32(left01.val[0], left23.val[0]);
    const int32x4x2_t lt_odd = vtrnq_s64_to_s32(left01.val[1], left23.val[1]);
    const int32x4x2_t lb_even = vtrnq_s64_to_s32(left45.val[0], left67.val[0]);
    const int32x4x2_t lb_odd = vtrnq_s64_to_s32(left45.val[1], left67.val[1]);
    const int32x4x2_t rt_even = vtrnq_s64_to_s32(right01.val[0], right23.val[0]);
    const int32x4x2_t rt_odd = vtrnq_s64_to_s32(right01.val[1], right23.val[1]);
    const int32x4x2_t rb_even = vtrnq_s64_to_s32(right45.val[0], right67.val[0]);
    const int32x4x2_t rb_odd = vtrnq_s64_to_s32(right45.val[1], right67.val[1]);

    // Stage 3: reassemble whole output rows from 128-bit pieces. Output row c
    // is input column c: its rows 0-3 part goes to val[0], rows 4-7 to val[1].
    //   a0: 00 10 20 30 40 50 60 70
    //   a1: 01 11 21 31 41 51 61 71
    //   a2: 02 12 22 32 42 52 62 72
    //   a3: 03 13 23 33 43 53 63 73
    //   a4: 04 14 24 34 44 54 64 74
    //   a5: 05 15 25 35 45 55 65 75
    //   a6: 06 16 26 36 46 56 66 76
    //   a7: 07 17 27 37 47 57 67 77
    a0->val[0] = lt_even.val[0];
    a0->val[1] = lb_even.val[0];
    a1->val[0] = lt_odd.val[0];
    a1->val[1] = lb_odd.val[0];
    a2->val[0] = lt_even.val[1];
    a2->val[1] = lb_even.val[1];
    a3->val[0] = lt_odd.val[1];
    a3->val[1] = lb_odd.val[1];
    a4->val[0] = rt_even.val[0];
    a4->val[1] = rb_even.val[0];
    a5->val[0] = rt_odd.val[0];
    a5->val[1] = rb_odd.val[0];
    a6->val[0] = rt_even.val[1];
    a6->val[1] = rb_even.val[1];
    a7->val[0] = rt_odd.val[1];
    a7->val[1] = rb_odd.val[1];
}
371 #endif /* _IHEVC_CMN_UTILS_NEON_H_ */
372