/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
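
// A minimal usage sketch, illustrative only and not part of the library API:
// `src`, `dst`, and the strides are hypothetical. Load eight rows of bytes,
// transpose them in registers, and store the eight transposed rows.
static INLINE void example_transpose_u8_8x8(const uint8_t *src, int src_stride,
                                            uint8_t *dst, int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1_u8(dst + 0 * dst_stride, r0);
  vst1_u8(dst + 1 * dst_stride, r1);
  vst1_u8(dst + 2 * dst_stride, r2);
  vst1_u8(dst + 3 * dst_stride, r3);
  vst1_u8(dst + 4 * dst_stride, r4);
  vst1_u8(dst + 5 * dst_stride, r5);
  vst1_u8(dst + 6 * dst_stride, r6);
  vst1_u8(dst + 7 * dst_stride, r7);
}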

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}
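
// A minimal usage sketch, illustrative only and not part of the library API:
// after the call each vector packs two transposed 4-byte rows (row i in the
// low half, row i + 4 in the high half), so one 32-bit lane store per half
// writes out a row. `src`, `dst`, and the strides are hypothetical, and the
// lane accesses assume 4-byte-aligned rows.
static INLINE void example_transpose_u8_8x4(const uint8_t *src, int src_stride,
                                            uint8_t *dst, int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  transpose_u8_8x4(&r0, &r1, &r2, &r3);
  // r0 holds transposed rows 0 and 4, r1 rows 1 and 5, r2 rows 2 and 6,
  // r3 rows 3 and 7.
  const uint32x2_t q0 = vreinterpret_u32_u8(r0);
  const uint32x2_t q1 = vreinterpret_u32_u8(r1);
  const uint32x2_t q2 = vreinterpret_u32_u8(r2);
  const uint32x2_t q3 = vreinterpret_u32_u8(r3);
  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), q0, 0);
  vst1_lane_u32((uint32_t *)(dst + 4 * dst_stride), q0, 1);
  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), q1, 0);
  vst1_lane_u32((uint32_t *)(dst + 5 * dst_stride), q1, 1);
  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), q2, 0);
  vst1_lane_u32((uint32_t *)(dst + 6 * dst_stride), q2, 1);
  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), q3, 0);
  vst1_lane_u32((uint32_t *)(dst + 7 * dst_stride), q3, 1);
}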

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
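
// A minimal usage sketch, illustrative only and not part of the library API:
// the 4x4 block is packed two rows per vector on input (rows 0,1 and rows
// 2,3) and comes back packed as transposed rows 0,2 and rows 1,3. `src`,
// `dst`, and the strides are hypothetical; rows move as 32-bit lanes, which
// assumes 4-byte-aligned rows.
static INLINE void example_transpose_u8_4x4(const uint8_t *src, int src_stride,
                                            uint8_t *dst, int dst_stride) {
  uint32x2_t p01 = vdup_n_u32(0);
  uint32x2_t p23 = vdup_n_u32(0);
  p01 = vld1_lane_u32((const uint32_t *)(src + 0 * src_stride), p01, 0);
  p01 = vld1_lane_u32((const uint32_t *)(src + 1 * src_stride), p01, 1);
  p23 = vld1_lane_u32((const uint32_t *)(src + 2 * src_stride), p23, 0);
  p23 = vld1_lane_u32((const uint32_t *)(src + 3 * src_stride), p23, 1);
  uint8x8_t r01 = vreinterpret_u8_u32(p01);
  uint8x8_t r23 = vreinterpret_u8_u32(p23);
  transpose_u8_4x4(&r01, &r23);
  // r01 now holds transposed rows 0 and 2, r23 rows 1 and 3.
  const uint32x2_t q02 = vreinterpret_u32_u8(r01);
  const uint32x2_t q13 = vreinterpret_u32_u8(r23);
  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), q02, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), q02, 1);
  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), q13, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), q13, 1);
}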

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}
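
// A minimal usage sketch, illustrative only and not part of the library API:
// eight 4-byte rows become four 8-byte rows. Only the low half of each input
// vector is used, so plain 8-byte loads work provided over-reading past
// column 3 of the hypothetical `src` buffer is acceptable.
static INLINE void example_transpose_u8_4x8(const uint8_t *src, int src_stride,
                                            uint8_t *dst, int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  const uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  const uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  const uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  const uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_u8_4x8(&r0, &r1, &r2, &r3, r4, r5, r6, r7);
  vst1_u8(dst + 0 * dst_stride, r0);
  vst1_u8(dst + 1 * dst_stride, r1);
  vst1_u8(dst + 2 * dst_stride, r2);
  vst1_u8(dst + 3 * dst_stride, r3);
}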

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}
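
// A minimal usage sketch, illustrative only and not part of the library API:
// eight 4-element rows of 16-bit values come out as four 8-element rows in
// the `o` outputs. `src`, `dst`, and the strides are hypothetical and
// counted in elements.
static INLINE void example_transpose_u16_4x8(const uint16_t *src,
                                             int src_stride, uint16_t *dst,
                                             int dst_stride) {
  uint16x4_t r0 = vld1_u16(src + 0 * src_stride);
  uint16x4_t r1 = vld1_u16(src + 1 * src_stride);
  uint16x4_t r2 = vld1_u16(src + 2 * src_stride);
  uint16x4_t r3 = vld1_u16(src + 3 * src_stride);
  uint16x4_t r4 = vld1_u16(src + 4 * src_stride);
  uint16x4_t r5 = vld1_u16(src + 5 * src_stride);
  uint16x4_t r6 = vld1_u16(src + 6 * src_stride);
  uint16x4_t r7 = vld1_u16(src + 7 * src_stride);
  uint16x8_t o0, o1, o2, o3;
  transpose_u16_4x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &o0, &o1, &o2,
                    &o3);
  vst1q_u16(dst + 0 * dst_stride, o0);
  vst1q_u16(dst + 1 * dst_stride, o1);
  vst1q_u16(dst + 2 * dst_stride, o2);
  vst1q_u16(dst + 3 * dst_stride, o3);
}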

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}
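
// A minimal usage sketch, illustrative only and not part of the library API:
// an 8x8 block of 16-bit values (e.g. high-bitdepth pixels) transposed in
// registers. `src`, `dst`, and the element strides are hypothetical.
static INLINE void example_transpose_u16_8x8(const uint16_t *src,
                                             int src_stride, uint16_t *dst,
                                             int dst_stride) {
  uint16x8_t r0 = vld1q_u16(src + 0 * src_stride);
  uint16x8_t r1 = vld1q_u16(src + 1 * src_stride);
  uint16x8_t r2 = vld1q_u16(src + 2 * src_stride);
  uint16x8_t r3 = vld1q_u16(src + 3 * src_stride);
  uint16x8_t r4 = vld1q_u16(src + 4 * src_stride);
  uint16x8_t r5 = vld1q_u16(src + 5 * src_stride);
  uint16x8_t r6 = vld1q_u16(src + 6 * src_stride);
  uint16x8_t r7 = vld1q_u16(src + 7 * src_stride);
  transpose_u16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1q_u16(dst + 0 * dst_stride, r0);
  vst1q_u16(dst + 1 * dst_stride, r1);
  vst1q_u16(dst + 2 * dst_stride, r2);
  vst1q_u16(dst + 3 * dst_stride, r3);
  vst1q_u16(dst + 4 * dst_stride, r4);
  vst1q_u16(dst + 5 * dst_stride, r5);
  vst1q_u16(dst + 6 * dst_stride, r6);
  vst1q_u16(dst + 7 * dst_stride, r7);
}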

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

// NEON has no 64-bit vtrnq intrinsic, so swap the 64-bit lanes of two q
// registers with vget_low/vget_high and vcombine, reinterpreting the result
// as int16x8.
static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
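
// A minimal usage sketch, illustrative only and not part of the library API:
// the array form reads eight rows from `in` and writes the eight transposed
// rows to `out`, so out[i] holds column i of the original block. `block` is
// a hypothetical contiguous 8x8 buffer of int16_t.
static INLINE void example_transpose_s16_8x8q(int16_t *block) {
  int16x8_t in[8], out[8];
  for (int i = 0; i < 8; ++i) in[i] = vld1q_s16(block + 8 * i);
  transpose_s16_8x8q(in, out);
  for (int i = 0; i < 8; ++i) vst1q_s16(block + 8 * i, out[i]);
}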

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}
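
// A minimal usage sketch, illustrative only and not part of the library API:
// a 4x4 block of 16-bit values in four d registers, transposed in place.
// `buf` is hypothetical and assumed contiguous (stride 4).
static INLINE void example_transpose_s16_4x4d(int16_t *buf) {
  int16x4_t r0 = vld1_s16(buf + 0);
  int16x4_t r1 = vld1_s16(buf + 4);
  int16x4_t r2 = vld1_s16(buf + 8);
  int16x4_t r3 = vld1_s16(buf + 12);
  transpose_s16_4x4d(&r0, &r1, &r2, &r3);
  vst1_s16(buf + 0, r0);
  vst1_s16(buf + 4, r1);
  vst1_s16(buf + 8, r2);
  vst1_s16(buf + 12, r3);
}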

// As with aom_vtrnq_s64_to_s16 above: transpose the 64-bit lanes of two q
// registers by recombining their low and high halves, keeping int32x4 views.
static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
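
// A minimal usage sketch, illustrative only and not part of the library API:
// a 4x4 block of 32-bit values transposed in place. `buf` is hypothetical
// and assumed contiguous (stride 4).
static INLINE void example_transpose_s32_4x4(int32_t *buf) {
  int32x4_t r0 = vld1q_s32(buf + 0);
  int32x4_t r1 = vld1q_s32(buf + 4);
  int32x4_t r2 = vld1q_s32(buf + 8);
  int32x4_t r3 = vld1q_s32(buf + 12);
  transpose_s32_4x4(&r0, &r1, &r2, &r3);
  vst1q_s32(buf + 0, r0);
  vst1q_s32(buf + 4, r1);
  vst1q_s32(buf + 8, r2);
  vst1q_s32(buf + 12, r3);
}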

#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_