/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"

// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
//
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}
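
// Illustrative example (not part of the API): with a0 = {0, 1, 2, 3} and
// a1 = {4, 5, 6, 7} viewed as 32-bit lanes, both the AArch64 and the AArch32
// paths above yield b0.val[0] = {0, 1, 4, 5} and b0.val[1] = {2, 3, 6, 7},
// i.e. the two low 64-bit halves concatenated, then the two high halves. The
// same pattern holds for the sibling vpx_vtrnq_* helpers below.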

static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  int64x2x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
  b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
#else
  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                           vreinterpret_s64_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                           vreinterpret_s64_s32(vget_high_s32(a1)));
#endif
  return b0;
}

static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  uint8x16x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u8_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u8_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                          vreinterpret_u8_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                          vreinterpret_u8_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if VPX_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
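
// A minimal usage sketch (illustrative only; `buf` is a hypothetical pointer
// to a contiguous 4x4 block of bytes):
//
//   uint8x8_t r01 = vld1_u8(buf);      // rows 0 and 1
//   uint8x8_t r23 = vld1_u8(buf + 8);  // rows 2 and 3
//   transpose_u8_4x4(&r01, &r23);
//   // r01 holds columns 0 and 2, r23 holds columns 1 and 3 of the original
//   // block -- note the interleaved output order documented above.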

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const int32x4x2_t b0 =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));

  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21 02 03 22 23
  // c1: 10 11 30 31 12 13 32 33

  const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
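
// A minimal usage sketch (illustrative only; `buf` is a hypothetical pointer
// to a contiguous 4x4 block of int16_t):
//
//   int16x8_t r01 = vld1q_s16(buf);      // rows 0 and 1
//   int16x8_t r23 = vld1q_s16(buf + 8);  // rows 2 and 3
//   transpose_s16_4x4q(&r01, &r23);
//   // r01 holds columns 0 and 2, r23 holds columns 1 and 3, matching the
//   // d0.val[] layout documented above.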

static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint32x4x2_t b0 =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));

  // Swap 64 bit elements resulting in:
  // c0: 00 01 20 21 02 03 22 23
  // c1: 10 11 30 31 12 13 32 33

  const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}
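
// A minimal usage sketch (illustrative only; `src` and `stride` are
// hypothetical). Each load reads 8 bytes per row, of which only the first 4
// are used:
//
//   uint8x8_t r[8];
//   int i;
//   for (i = 0; i < 8; i++) r[i] = vld1_u8(src + i * stride);
//   transpose_u8_4x8(&r[0], &r[1], &r[2], &r[3], r[4], r[5], r[6], r[7]);
//   // r[0]..r[3] now hold columns 0..3 of the 8-row by 4-column strip.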

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}
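
// A minimal usage sketch (illustrative only; `src` and `stride` are a
// hypothetical int16_t pointer and element stride):
//
//   int16x8_t c0, c1, c2, c3;
//   transpose_s16_4x8(vld1_s16(src + 0 * stride), vld1_s16(src + 1 * stride),
//                     vld1_s16(src + 2 * stride), vld1_s16(src + 3 * stride),
//                     vld1_s16(src + 4 * stride), vld1_s16(src + 5 * stride),
//                     vld1_s16(src + 6 * stride), vld1_s16(src + 7 * stride),
//                     &c0, &c1, &c2, &c3);
//   // c0..c3 hold columns 0..3 spanning all eight input rows.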

static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c2.val[0]);
  *a2 = vreinterpretq_s32_s64(c1.val[0]);
  *a3 = vreinterpretq_s32_s64(c3.val[0]);
  *a4 = vreinterpretq_s32_s64(c0.val[1]);
  *a5 = vreinterpretq_s32_s64(c2.val[1]);
  *a6 = vreinterpretq_s32_s64(c1.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  *a0 = vreinterpretq_u16_u32(c0.val[0]);
  *a1 = vreinterpretq_u16_u32(c1.val[0]);
  *a2 = vreinterpretq_u16_u32(c0.val[1]);
  *a3 = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 04 05 06 07
  // a2: 10 11 12 13
  // a3: 14 15 16 17
  // a4: 20 21 22 23
  // a5: 24 25 26 27
  // a6: 30 31 32 33
  // a7: 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 04 14 06 16
  // b1.val[1]: 05 15 07 17
  // b2.val[0]: 20 30 22 32
  // b2.val[1]: 21 31 23 33
  // b3.val[0]: 24 34 26 36
  // b3.val[1]: 25 35 27 37

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
  const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
  const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 04 14 24 34
  // c2.val[1]: 06 16 26 36
  // c3.val[0]: 05 15 25 35
  // c3.val[1]: 07 17 27 37

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c1.val[0]);
  *a2 = vreinterpretq_s32_s64(c0.val[1]);
  *a3 = vreinterpretq_s32_s64(c1.val[1]);
  *a4 = vreinterpretq_s32_s64(c2.val[0]);
  *a5 = vreinterpretq_s32_s64(c3.val[0]);
  *a6 = vreinterpretq_s32_s64(c2.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

// Note: Using 'd' registers or 'q' registers has almost identical speed. We use
// 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
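
// A minimal usage sketch (illustrative only; `src` and `stride` are
// hypothetical):
//
//   uint8x8_t r[8];
//   int i;
//   for (i = 0; i < 8; i++) r[i] = vld1_u8(src + i * stride);
//   transpose_u8_8x8(&r[0], &r[1], &r[2], &r[3], &r[4], &r[5], &r[6], &r[7]);
//   // r[c] now holds column c of the original 8x8 block.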

// Transpose 8x8 to a new location.
static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}
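
// A minimal usage sketch (illustrative only; the arrays are hypothetical):
//
//   int16x8_t in[8], out[8];
//   /* ... fill in[0..7] with the rows of an 8x8 block ... */
//   transpose_s16_8x8q(in, out);
//   // out[c] holds column c of the block; in[] is left untouched.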

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 02 12 01 11 03 13
  // b1: 20 30 22 32 21 31 23 33
  // b2: 40 50 42 52 41 51 43 53
  // b3: 60 70 62 72 61 71 63 73
  // b4: 04 14 06 16 05 15 07 17
  // b5: 24 34 26 36 25 35 27 37
  // b6: 44 54 46 56 45 55 47 57
  // b7: 64 74 66 76 65 75 67 77

  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);

  // Swap 64 bit elements resulting in:
  // c0: 00 10 20 30 02 12 22 32
  // c1: 01 11 21 31 03 13 23 33
  // c2: 40 50 60 70 42 52 62 72
  // c3: 41 51 61 71 43 53 63 73
  // c4: 04 14 24 34 06 16 26 36
  // c5: 05 15 25 35 07 17 27 37
  // c6: 44 54 64 74 46 56 66 76
  // c7: 45 55 65 75 47 57 67 77
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);

  // Swap 128 bit elements resulting in:
  // a0: 00 10 20 30 40 50 60 70
  // a1: 01 11 21 31 41 51 61 71
  // a2: 02 12 22 32 42 52 62 72
  // a3: 03 13 23 33 43 53 63 73
  // a4: 04 14 24 34 44 54 64 74
  // a5: 05 15 25 35 45 55 65 75
  // a6: 06 16 26 36 46 56 66 76
  // a7: 07 17 27 37 47 57 67 77
  a0->val[0] = c0.val[0];
  a0->val[1] = c2.val[0];
  a1->val[0] = c1.val[0];
  a1->val[1] = c3.val[0];
  a2->val[0] = c0.val[1];
  a2->val[1] = c2.val[1];
  a3->val[0] = c1.val[1];
  a3->val[1] = c3.val[1];
  a4->val[0] = c4.val[0];
  a4->val[1] = c6.val[0];
  a5->val[0] = c5.val[0];
  a5->val[1] = c7.val[0];
  a6->val[0] = c4.val[1];
  a6->val[1] = c6.val[1];
  a7->val[0] = c5.val[1];
  a7->val[1] = c7.val[1];
}

// Helper transpose function for highbd FDCT variants
static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
                                       int32x4_t *right /*[8]*/,
                                       int32x4_t *out_left /*[8]*/,
                                       int32x4_t *out_right /*[8]*/) {
  int32x4x2_t out[8];

  out[0].val[0] = left[0];
  out[0].val[1] = right[0];
  out[1].val[0] = left[1];
  out[1].val[1] = right[1];
  out[2].val[0] = left[2];
  out[2].val[1] = right[2];
  out[3].val[0] = left[3];
  out[3].val[1] = right[3];
  out[4].val[0] = left[4];
  out[4].val[1] = right[4];
  out[5].val[0] = left[5];
  out[5].val[1] = right[5];
  out[6].val[0] = left[6];
  out[6].val[1] = right[6];
  out[7].val[0] = left[7];
  out[7].val[1] = right[7];

  transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                    &out[6], &out[7]);

  out_left[0] = out[0].val[0];
  out_left[1] = out[1].val[0];
  out_left[2] = out[2].val[0];
  out_left[3] = out[3].val[0];
  out_left[4] = out[4].val[0];
  out_left[5] = out[5].val[0];
  out_left[6] = out[6].val[0];
  out_left[7] = out[7].val[0];
  out_right[0] = out[0].val[1];
  out_right[1] = out[1].val[1];
  out_right[2] = out[2].val[1];
  out_right[3] = out[3].val[1];
  out_right[4] = out[4].val[1];
  out_right[5] = out[5].val[1];
  out_right[6] = out[6].val[1];
  out_right[7] = out[7].val[1];
}
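
// A minimal usage sketch (illustrative only; `left`/`right` hold columns 0-3
// and 4-7 of each row). The inputs are copied into a local array before any
// output is written, so calling it in place is safe, as transpose_s32_16x16
// below does:
//
//   int32x4_t left[8], right[8];
//   /* ... load an 8x8 block of 32-bit coefficients ... */
//   transpose_s32_8x8_2(left, right, left, right);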

static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1,
                                       int32x4_t *left2, int32x4_t *right2) {
  int32x4_t tl[16], tr[16];

  // Transpose the four 8x8 quadrants separately, but first swap quadrants 2
  // and 3.
  tl[0] = left1[8];
  tl[1] = left1[9];
  tl[2] = left1[10];
  tl[3] = left1[11];
  tl[4] = left1[12];
  tl[5] = left1[13];
  tl[6] = left1[14];
  tl[7] = left1[15];
  tr[0] = right1[8];
  tr[1] = right1[9];
  tr[2] = right1[10];
  tr[3] = right1[11];
  tr[4] = right1[12];
  tr[5] = right1[13];
  tr[6] = right1[14];
  tr[7] = right1[15];

  left1[8] = left2[0];
  left1[9] = left2[1];
  left1[10] = left2[2];
  left1[11] = left2[3];
  left1[12] = left2[4];
  left1[13] = left2[5];
  left1[14] = left2[6];
  left1[15] = left2[7];
  right1[8] = right2[0];
  right1[9] = right2[1];
  right1[10] = right2[2];
  right1[11] = right2[3];
  right1[12] = right2[4];
  right1[13] = right2[5];
  right1[14] = right2[6];
  right1[15] = right2[7];

  left2[0] = tl[0];
  left2[1] = tl[1];
  left2[2] = tl[2];
  left2[3] = tl[3];
  left2[4] = tl[4];
  left2[5] = tl[5];
  left2[6] = tl[6];
  left2[7] = tl[7];
  right2[0] = tr[0];
  right2[1] = tr[1];
  right2[2] = tr[2];
  right2[3] = tr[3];
  right2[4] = tr[4];
  right2[5] = tr[5];
  right2[6] = tr[6];
  right2[7] = tr[7];

  transpose_s32_8x8_2(left1, right1, left1, right1);
  transpose_s32_8x8_2(left2, right2, left2, right2);
  transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8);
  transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8);
}

static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0 : 00 10 20 30 40 50 60 70
  // o1 : 01 11 21 31 41 51 61 71
  // o2 : 02 12 22 32 42 52 62 72
  // o3 : 03 13 23 33 43 53 63 73
  // o4 : 04 14 24 34 44 54 64 74
  // o5 : 05 15 25 35 45 55 65 75
  // o6 : 06 16 26 36 46 56 66 76
  // o7 : 07 17 27 37 47 57 67 77
  // o8 : 08 18 28 38 48 58 68 78
  // o9 : 09 19 29 39 49 59 69 79
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}
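
// A minimal usage sketch (illustrative only; `src` and `stride` are
// hypothetical): eight 16-byte rows in, sixteen 8-byte rows out.
//
//   uint8x16_t in[8];
//   uint8x8_t out[16];
//   int i;
//   for (i = 0; i < 8; i++) in[i] = vld1q_u8(src + i * stride);
//   transpose_u8_16x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
//                     &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
//                     &out[6], &out[7], &out[8], &out[9], &out[10], &out[11],
//                     &out[12], &out[13], &out[14], &out[15]);
//   // out[c] holds the 8 values of column c of the original 8x16 block.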
1081
transpose_u8_8x16(const uint8x8_t i0,const uint8x8_t i1,const uint8x8_t i2,const uint8x8_t i3,const uint8x8_t i4,const uint8x8_t i5,const uint8x8_t i6,const uint8x8_t i7,const uint8x8_t i8,const uint8x8_t i9,const uint8x8_t i10,const uint8x8_t i11,const uint8x8_t i12,const uint8x8_t i13,const uint8x8_t i14,const uint8x8_t i15,uint8x16_t * o0,uint8x16_t * o1,uint8x16_t * o2,uint8x16_t * o3,uint8x16_t * o4,uint8x16_t * o5,uint8x16_t * o6,uint8x16_t * o7)1082 static INLINE void transpose_u8_8x16(
1083 const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
1084 const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
1085 const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
1086 const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
1087 const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
1088 const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
1089 uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
1090 uint8x16_t *o7) {
1091 // Combine 8 bit elements. Goes from:
1092 // i0 : 00 01 02 03 04 05 06 07
1093 // i1 : 10 11 12 13 14 15 16 17
1094 // i2 : 20 21 22 23 24 25 26 27
1095 // i3 : 30 31 32 33 34 35 36 37
1096 // i4 : 40 41 42 43 44 45 46 47
1097 // i5 : 50 51 52 53 54 55 56 57
1098 // i6 : 60 61 62 63 64 65 66 67
1099 // i7 : 70 71 72 73 74 75 76 77
1100 // i8 : 80 81 82 83 84 85 86 87
1101 // i9 : 90 91 92 93 94 95 96 97
1102 // i10: A0 A1 A2 A3 A4 A5 A6 A7
1103 // i11: B0 B1 B2 B3 B4 B5 B6 B7
1104 // i12: C0 C1 C2 C3 C4 C5 C6 C7
1105 // i13: D0 D1 D2 D3 D4 D5 D6 D7
1106 // i14: E0 E1 E2 E3 E4 E5 E6 E7
1107 // i15: F0 F1 F2 F3 F4 F5 F6 F7
1108 // to:
1109 // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
1110 // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
1111 // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
1112 // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
1113 // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
1114 // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
1115 // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
1116 // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
1117 const uint8x16_t a0 = vcombine_u8(i0, i8);
1118 const uint8x16_t a1 = vcombine_u8(i1, i9);
1119 const uint8x16_t a2 = vcombine_u8(i2, i10);
1120 const uint8x16_t a3 = vcombine_u8(i3, i11);
1121 const uint8x16_t a4 = vcombine_u8(i4, i12);
1122 const uint8x16_t a5 = vcombine_u8(i5, i13);
1123 const uint8x16_t a6 = vcombine_u8(i6, i14);
1124 const uint8x16_t a7 = vcombine_u8(i7, i15);
1125
1126 // Swap 8 bit elements resulting in:
1127 // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
1128 // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
1129 // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
1130 // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
1131 // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
1132 // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
1133 // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
1134 // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
1135 const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
1136 const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
1137 const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
1138 const uint8x16x2_t b3 = vtrnq_u8(a6, a7);
1139
1140 // Swap 16 bit elements resulting in:
1141 // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
1142 // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
1143 // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
1144 // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
1145 // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
1146 // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
1147 // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
1148 // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
1149 const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
1150 vreinterpretq_u16_u8(b1.val[0]));
1151 const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
1152 vreinterpretq_u16_u8(b1.val[1]));
1153 const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
1154 vreinterpretq_u16_u8(b3.val[0]));
1155 const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
1156 vreinterpretq_u16_u8(b3.val[1]));
1157
1158 // Swap 32 bit elements resulting in:
1159 // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
1160 // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
1161 // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
1162 // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
1163 // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
1164 // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
1165 // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
1166 // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
1167 const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
1168 vreinterpretq_u32_u16(c2.val[0]));
1169 const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
1170 vreinterpretq_u32_u16(c2.val[1]));
1171 const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
1172 vreinterpretq_u32_u16(c3.val[0]));
1173 const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
1174 vreinterpretq_u32_u16(c3.val[1]));
1175
1176 // Output:
1177 // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
1178 // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
1179 // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
1180 // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
1181 // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
1182 // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
1183 // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
1184 // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
1185 *o0 = vreinterpretq_u8_u32(d0.val[0]);
1186 *o1 = vreinterpretq_u8_u32(d2.val[0]);
1187 *o2 = vreinterpretq_u8_u32(d1.val[0]);
1188 *o3 = vreinterpretq_u8_u32(d3.val[0]);
1189 *o4 = vreinterpretq_u8_u32(d0.val[1]);
1190 *o5 = vreinterpretq_u8_u32(d2.val[1]);
1191 *o6 = vreinterpretq_u8_u32(d1.val[1]);
1192 *o7 = vreinterpretq_u8_u32(d3.val[1]);
1193 }
1194
transpose_u8_16x16(const uint8x16_t i0,const uint8x16_t i1,const uint8x16_t i2,const uint8x16_t i3,const uint8x16_t i4,const uint8x16_t i5,const uint8x16_t i6,const uint8x16_t i7,const uint8x16_t i8,const uint8x16_t i9,const uint8x16_t i10,const uint8x16_t i11,const uint8x16_t i12,const uint8x16_t i13,const uint8x16_t i14,const uint8x16_t i15,uint8x16_t * o0,uint8x16_t * o1,uint8x16_t * o2,uint8x16_t * o3,uint8x16_t * o4,uint8x16_t * o5,uint8x16_t * o6,uint8x16_t * o7,uint8x16_t * o8,uint8x16_t * o9,uint8x16_t * o10,uint8x16_t * o11,uint8x16_t * o12,uint8x16_t * o13,uint8x16_t * o14,uint8x16_t * o15)1195 static INLINE void transpose_u8_16x16(
1196 const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
1197 const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
1198 const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
1199 const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
1200 const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
1201 const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
1202 uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
1203 uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
1204 uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
1205 uint8x16_t *o15) {
1206 // Swap 8 bit elements. Goes from:
1207 // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
1208 // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
1209 // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
1210 // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
1211 // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
1212 // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
1213 // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
1214 // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
1215 // i8: 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
1216 // i9: 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
1217 // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
1218 // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
1219 // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
1220 // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
1221 // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
1222 // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
1223 // to:
1224 // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
1225 // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
1226 // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
1227 // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
1228 // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
1229 // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
1230 // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
1231 // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
1232 // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
1233 // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
1234 // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
1235 // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
1236 // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
1237 // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
1238 // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
1239 // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
1240 const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
1241 const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
1242 const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
1243 const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
1244 const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
1245 const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
1246 const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
1247 const uint8x16x2_t b7 = vtrnq_u8(i14, i15);
1248
1249 // Swap 16 bit elements resulting in:
1250 // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
1251 // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
1252 // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
1253 // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
1254 // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
1255 // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
1256 // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
1257 // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
1258 // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
1259 // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
1260 // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
1261 // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
1262 // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
1263 // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
1264 // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
1265 // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
1266 const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
1267 vreinterpretq_u16_u8(b1.val[0]));
1268 const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
1269 vreinterpretq_u16_u8(b1.val[1]));
1270 const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
1271 vreinterpretq_u16_u8(b3.val[0]));
1272 const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
1273 vreinterpretq_u16_u8(b3.val[1]));
1274 const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
1275 vreinterpretq_u16_u8(b5.val[0]));
1276 const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
1277 vreinterpretq_u16_u8(b5.val[1]));
1278 const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
1279 vreinterpretq_u16_u8(b7.val[0]));
1280 const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
1281 vreinterpretq_u16_u8(b7.val[1]));
1282
1283 // Swap 32 bit elements resulting in:
1284 // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
1285 // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
1286 // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
1287 // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
1288 // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
1289 // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
1290 // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
1291 // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
1292 // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
1293 // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
1294 // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
1295 // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
1296 // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
1297 // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
1298 // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
1299 // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
1300 const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
1301 vreinterpretq_u32_u16(c2.val[0]));
1302 const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
1303 vreinterpretq_u32_u16(c2.val[1]));
1304 const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
1305 vreinterpretq_u32_u16(c3.val[0]));
1306 const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
1307 vreinterpretq_u32_u16(c3.val[1]));
1308 const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
1309 vreinterpretq_u32_u16(c6.val[0]));
1310 const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
1311 vreinterpretq_u32_u16(c6.val[1]));
1312 const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
1313 vreinterpretq_u32_u16(c7.val[0]));
1314 const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
1315 vreinterpretq_u32_u16(c7.val[1]));
1316
  // Swap 64 bit elements resulting in:
  // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);

  // Output:
  // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  *o0 = e0.val[0];
  *o1 = e1.val[0];
  *o2 = e2.val[0];
  *o3 = e3.val[0];
  *o4 = e4.val[0];
  *o5 = e5.val[0];
  *o6 = e6.val[0];
  *o7 = e7.val[0];
  *o8 = e0.val[1];
  *o9 = e1.val[1];
  *o10 = e2.val[1];
  *o11 = e3.val[1];
  *o12 = e4.val[1];
  *o13 = e5.val[1];
  *o14 = e6.val[1];
  *o15 = e7.val[1];
}

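// A minimal usage sketch (illustrative only, not part of this header's API):
// load a 16x16 block of bytes, transpose it in registers, and store it back.
// It assumes transpose_u8_16x16() takes its sixteen uint8x16_t inputs by
// value followed by the sixteen output pointers o0..o15 seen above.
static INLINE void example_transpose_u8_16x16(uint8_t *a, const int a_stride) {
  uint8x16_t r[16], o[16];
  int i;
  // Gather the sixteen source rows.
  for (i = 0; i < 16; ++i) r[i] = vld1q_u8(a + i * a_stride);
  transpose_u8_16x16(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8],
                     r[9], r[10], r[11], r[12], r[13], r[14], r[15], &o[0],
                     &o[1], &o[2], &o[3], &o[4], &o[5], &o[6], &o[7], &o[8],
                     &o[9], &o[10], &o[11], &o[12], &o[13], &o[14], &o[15]);
  // Write the transposed rows over the original block.
  for (i = 0; i < 16; ++i) vst1q_u8(a + i * a_stride, o[i]);
}
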
static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
  int16x8_t t[8];

  // Transpose the four 8x8 quadrants separately, but first swap the two
  // off-diagonal quadrants: if the matrix is M = [ A B ; C D ] in 8x8 blocks,
  // then M^T = [ A^T C^T ; B^T D^T ], so exchanging B and C up front lets
  // each quadrant be transposed in place.
  t[0] = in0[8];
  t[1] = in0[9];
  t[2] = in0[10];
  t[3] = in0[11];
  t[4] = in0[12];
  t[5] = in0[13];
  t[6] = in0[14];
  t[7] = in0[15];
  in0[8] = in1[0];
  in0[9] = in1[1];
  in0[10] = in1[2];
  in0[11] = in1[3];
  in0[12] = in1[4];
  in0[13] = in1[5];
  in0[14] = in1[6];
  in0[15] = in1[7];
  in1[0] = t[0];
  in1[1] = t[1];
  in1[2] = t[2];
  in1[3] = t[3];
  in1[4] = t[4];
  in1[5] = t[5];
  in1[6] = t[6];
  in1[7] = t[7];

  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
                    &in0[6], &in0[7]);
  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
                    &in0[14], &in0[15]);
  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
                    &in1[6], &in1[7]);
  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
                    &in1[14], &in1[15]);
}

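// A minimal sketch of driving transpose_s16_16x16() from memory (illustrative
// only; the helper name is not part of libvpx). The layout the quadrant swap
// implies is assumed here: in0[i] holds columns 0-7 of row i and in1[i] holds
// columns 8-15 of row i, for i = 0..15.
static INLINE void example_load_and_transpose_s16_16x16(const int16_t *a,
                                                        const int a_stride,
                                                        int16x8_t *in0,
                                                        int16x8_t *in1) {
  int i;
  for (i = 0; i < 16; ++i) {
    in0[i] = vld1q_s16(a + i * a_stride);      // left eight columns
    in1[i] = vld1q_s16(a + i * a_stride + 8);  // right eight columns
  }
  transpose_s16_16x16(in0, in1);
}
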
// Load eight rows of 8 bytes and transpose them, keeping only the first four
// output vectors; each of a0..a3 ends up holding one column of the 8x8 block.
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  uint8x8_t a4, a5, a6, a7;
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  a4 = vld1_u8(a);
  a += a_stride;
  a5 = vld1_u8(a);
  a += a_stride;
  a6 = vld1_u8(a);
  a += a_stride;
  a7 = vld1_u8(a);

  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  *a4 = vld1_u8(a);
  a += a_stride;
  *a5 = vld1_u8(a);
  a += a_stride;
  *a6 = vld1_u8(a);
  a += a_stride;
  *a7 = vld1_u8(a);

  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

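// A minimal sketch of why the load-and-transpose helpers are useful
// (illustrative only; the helper name is not part of libvpx): after
// load_and_transpose_u8_8x8(), each vector holds one column of the source
// block, so per-column reductions become straight horizontal adds.
static INLINE void example_column_sums_u8_8x8(const uint8_t *a,
                                              const int a_stride,
                                              uint16_t sums[8]) {
  uint8x8_t c[8];
  int i;
  load_and_transpose_u8_8x8(a, a_stride, &c[0], &c[1], &c[2], &c[3], &c[4],
                            &c[5], &c[6], &c[7]);
  for (i = 0; i < 8; ++i) {
    // Widening pairwise adds work on both AArch32 and AArch64.
    sums[i] =
        (uint16_t)vget_lane_u64(vpaddl_u32(vpaddl_u16(vpaddl_u8(c[i]))), 0);
  }
}
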
static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1_u8(a, a0);
  a += a_stride;
  vst1_u8(a, a1);
  a += a_stride;
  vst1_u8(a, a2);
  a += a_stride;
  vst1_u8(a, a3);
  a += a_stride;
  vst1_u8(a, a4);
  a += a_stride;
  vst1_u8(a, a5);
  a += a_stride;
  vst1_u8(a, a6);
  a += a_stride;
  vst1_u8(a, a7);
}

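// A minimal companion sketch (illustrative only; the helper name is not part
// of libvpx): copy an 8x8 block while transposing it once, by loading the
// source rows untransposed and letting transpose_and_store_u8_8x8() do the
// rotation on the way out.
static INLINE void example_transpose_copy_u8_8x8(uint8_t *dst,
                                                 const int dst_stride,
                                                 const uint8_t *src,
                                                 const int src_stride) {
  uint8x8_t r0, r1, r2, r3, r4, r5, r6, r7;
  r0 = vld1_u8(src);
  src += src_stride;
  r1 = vld1_u8(src);
  src += src_stride;
  r2 = vld1_u8(src);
  src += src_stride;
  r3 = vld1_u8(src);
  src += src_stride;
  r4 = vld1_u8(src);
  src += src_stride;
  r5 = vld1_u8(src);
  src += src_stride;
  r6 = vld1_u8(src);
  src += src_stride;
  r7 = vld1_u8(src);
  transpose_and_store_u8_8x8(dst, dst_stride, r0, r1, r2, r3, r4, r5, r6, r7);
}
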
static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride, int16x8_t *a0,
                                              int16x8_t *a1, int16x8_t *a2,
                                              int16x8_t *a3, int16x8_t *a4,
                                              int16x8_t *a5, int16x8_t *a6,
                                              int16x8_t *a7) {
  *a0 = vld1q_s16(a);
  a += a_stride;
  *a1 = vld1q_s16(a);
  a += a_stride;
  *a2 = vld1q_s16(a);
  a += a_stride;
  *a3 = vld1q_s16(a);
  a += a_stride;
  *a4 = vld1q_s16(a);
  a += a_stride;
  *a5 = vld1q_s16(a);
  a += a_stride;
  *a6 = vld1q_s16(a);
  a += a_stride;
  *a7 = vld1q_s16(a);

  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

// Each row of eight 32-bit values spans both halves of an int32x4x2_t:
// val[0] holds lanes 0-3 and val[1] holds lanes 4-7.
static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  a0->val[0] = vld1q_s32(a);
  a0->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a1->val[0] = vld1q_s32(a);
  a1->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a2->val[0] = vld1q_s32(a);
  a2->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a3->val[0] = vld1q_s32(a);
  a3->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a4->val[0] = vld1q_s32(a);
  a4->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a5->val[0] = vld1q_s32(a);
  a5->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a6->val[0] = vld1q_s32(a);
  a6->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a7->val[0] = vld1q_s32(a);
  a7->val[1] = vld1q_s32(a + 4);

  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
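
// A minimal companion sketch (illustrative only; the helper name is not part
// of libvpx): store the eight transposed rows produced by
// load_and_transpose_s32_8x8(), honoring the same two-halves-per-row layout.
// The rows argument is assumed to point at eight int32x4x2_t values.
static INLINE void example_store_s32_8x8(int32_t *a, const int a_stride,
                                         const int32x4x2_t *const rows) {
  int i;
  for (i = 0; i < 8; ++i) {
    vst1q_s32(a, rows[i].val[0]);
    vst1q_s32(a + 4, rows[i].val[1]);
    a += a_stride;
  }
}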
#endif  // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_