/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "config/aom_config.h"  // Defines the INLINE macro used below.

// Swap high and low halves, e.g.
// 00 01 02 03 04 05 06 07 becomes 04 05 06 07 00 01 02 03.
static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
  return vextq_u16(a, a, 4);
}

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
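
// A minimal usage sketch (illustrative only; the function name and parameters
// are hypothetical, not part of this header): load an 8x8 block of bytes from
// a strided buffer, transpose it in registers, and store it back, so that row
// i of the output holds column i of the input.
static INLINE void example_transpose_u8_8x8_block(uint8_t *buf, int stride) {
  uint8x8_t r0 = vld1_u8(buf + 0 * stride);
  uint8x8_t r1 = vld1_u8(buf + 1 * stride);
  uint8x8_t r2 = vld1_u8(buf + 2 * stride);
  uint8x8_t r3 = vld1_u8(buf + 3 * stride);
  uint8x8_t r4 = vld1_u8(buf + 4 * stride);
  uint8x8_t r5 = vld1_u8(buf + 5 * stride);
  uint8x8_t r6 = vld1_u8(buf + 6 * stride);
  uint8x8_t r7 = vld1_u8(buf + 7 * stride);

  transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);

  vst1_u8(buf + 0 * stride, r0);
  vst1_u8(buf + 1 * stride, r1);
  vst1_u8(buf + 2 * stride, r2);
  vst1_u8(buf + 3 * stride, r3);
  vst1_u8(buf + 4 * stride, r4);
  vst1_u8(buf + 5 * stride, r5);
  vst1_u8(buf + 6 * stride, r6);
  vst1_u8(buf + 7 * stride, r7);
}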

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

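// NEON has no vtrnq_u64, so emulate a transpose of 64-bit lanes by
// recombining the halves of the two inputs. Treating each 128-bit input as a
// pair of 64-bit elements:
// a0: A0 A1
// a1: B0 B1
// becomes:
// b0.val[0]: A0 B0
// b0.val[1]: A1 B1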
static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(const uint32x4_t a0,
                                                const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_u16_4x8q will produce the same reversals, but with
// the order of the low halves also restored relative to the high halves. This
// is preferable because it puts all values from the same source row back
// together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
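
// A minimal sketch of how the loop-filter transpose might be driven
// (illustrative only; the function name and parameters are hypothetical).
// For a vertical edge at column `s` in a high-bitdepth frame, each of four
// rows contributes p3 p2 p1 p0 q0 q1 q2 q3, and the transpose returns one
// pXqX vector per filter-tap pair.
static INLINE void example_load_pq_for_vertical_edge(const uint16_t *s,
                                                     int pitch,
                                                     uint16x8_t pq[4]) {
  const uint16_t *src = s - 4;  // Step back so each row covers p3..p0 q0..q3.
  pq[0] = vld1q_u16(src + 0 * pitch);
  pq[1] = vld1q_u16(src + 1 * pitch);
  pq[2] = vld1q_u16(src + 2 * pitch);
  pq[3] = vld1q_u16(src + 3 * pitch);
  loop_filter_transpose_u16_4x8q(pq);
  // pq[0] = p0q0, pq[1] = p1q1, pq[2] = p2q2, pq[3] = p3q3.
}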

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}
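
// A minimal sketch (illustrative only; the function name and parameters are
// hypothetical): gather eight rows of four 16-bit samples and transpose them
// into four 8-wide vectors, as a 4-column transform pass might.
static INLINE void example_transpose_s16_4x8_rows(const int16_t *src,
                                                  int stride,
                                                  int16x8_t out[4]) {
  int16x4_t r0 = vld1_s16(src + 0 * stride);
  int16x4_t r1 = vld1_s16(src + 1 * stride);
  int16x4_t r2 = vld1_s16(src + 2 * stride);
  int16x4_t r3 = vld1_s16(src + 3 * stride);
  int16x4_t r4 = vld1_s16(src + 4 * stride);
  int16x4_t r5 = vld1_s16(src + 5 * stride);
  int16x4_t r6 = vld1_s16(src + 6 * stride);
  int16x4_t r7 = vld1_s16(src + 7 * stride);
  transpose_s16_4x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &out[0], &out[1],
                    &out[2], &out[3]);
}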

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

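// Transpose 64-bit lanes, as in aom_vtrnq_u64_to_u16 above, but for signed
// 16-bit data.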
static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
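
// A minimal sketch (illustrative only; the function name is hypothetical):
// transpose an 8x8 block of 16-bit coefficients held as an array of eight
// vectors. All eight inputs are read before any output is written, so the
// transpose may be done in place by passing the same array twice.
static INLINE void example_transpose_coeff_block_8x8(int16x8_t coeffs[8]) {
  transpose_s16_8x8q(coeffs, coeffs);
}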

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

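// Transpose 64-bit lanes, as in aom_vtrnq_u64_to_u16 above, but for signed
// 32-bit data; this stands in for the missing vtrnq_s64.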
static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
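
// A minimal sketch (illustrative only; the function name and parameters are
// hypothetical): transpose a 4x4 block of 32-bit intermediates, e.g. between
// the row and column passes of a transform.
static INLINE void example_transpose_s32_4x4_block(const int32_t *src,
                                                   int32x4_t out[4]) {
  out[0] = vld1q_s32(src + 0);
  out[1] = vld1q_s32(src + 4);
  out[2] = vld1q_s32(src + 8);
  out[3] = vld1q_s32(src + 12);
  transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
}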

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_