/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
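
// Usage sketch (illustrative only): transpose an 8x8 byte block loaded from a
// row-major buffer. `src` and `stride` are hypothetical names, not part of
// this header.
//   uint8x8_t r0 = vld1_u8(src + 0 * stride);
//   uint8x8_t r1 = vld1_u8(src + 1 * stride);
//   ...                                        // likewise for r2..r6
//   uint8x8_t r7 = vld1_u8(src + 7 * stride);
//   transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
//   // r0 now holds column 0 of the original block, r1 column 1, and so on.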

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}
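
// Note that each output register of transpose_u8_8x4 packs two columns of the
// original 4x8 block: a0 holds columns 0 and 4, a1 columns 1 and 5, a2
// columns 2 and 6, and a3 columns 3 and 7 (see the c0/c1 layouts above).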

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
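
// Usage sketch (illustrative only): transpose_u8_4x4 expects the 4x4 block
// packed two rows per register. `buf` is a hypothetical pointer to 16
// contiguous row-major bytes.
//   uint8x8_t r01 = vld1_u8(buf);      // rows 0 and 1
//   uint8x8_t r23 = vld1_u8(buf + 8);  // rows 2 and 3
//   transpose_u8_4x4(&r01, &r23);
//   // r01 now holds columns 0 and 2; r23 holds columns 1 and 3.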

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}
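
// Note that transpose_u8_4x8 reads eight rows but only the low four bytes of
// each (the XX lanes above are don't-care), and returns the four transposed
// 8-wide rows through a0..a3; a4..a7 are passed by value since they are
// inputs only.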

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}
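
// Usage sketch (illustrative only): transpose an 8x4 block of 16-bit values
// into four 8-wide column vectors. `src` and `stride` are hypothetical.
//   uint16x4_t r0 = vld1_u16(src + 0 * stride);
//   ...                                          // likewise for r1..r6
//   uint16x4_t r7 = vld1_u16(src + 7 * stride);
//   uint16x8_t c0, c1, c2, c3;
//   transpose_u16_4x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7,
//                     &c0, &c1, &c2, &c3);
//   // c0 now holds column 0 of the 8x4 input, c1 column 1, etc.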

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}
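
// Note that the vget_low/vget_high + vcombine sequences above perform the
// final 64-bit-element swap: the vtrn intrinsics only support 8-, 16- and
// 32-bit lanes, so the swap is synthesized from half-vector moves.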

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}
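
// aom_vtrnq_s64_to_s16 is, in effect, a 64-bit-element transpose of two
// 128-bit vectors (what trn1/trn2 on 64-bit lanes compute on AArch64), with
// the result reinterpreted as 16-bit lanes for the caller below.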

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // a0 points to the first of eight consecutive input vectors; a1..a7 below
  // denote a0[1]..a0[7].
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
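
// Usage sketch (illustrative only): `rows` and `cols` are hypothetical
// arrays of eight vectors each.
//   int16x8_t rows[8], cols[8];
//   // ... load rows[0..7] with an 8x8 block of 16-bit values ...
//   transpose_s16_8x8q(rows, cols);
//   // cols[j] now holds column j of the original block.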

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
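
// Usage sketch (illustrative only): `src` is a hypothetical pointer to a
// contiguous, row-major 4x4 block of int32 values.
//   int32x4_t r0 = vld1q_s32(src + 0);
//   int32x4_t r1 = vld1q_s32(src + 4);
//   int32x4_t r2 = vld1q_s32(src + 8);
//   int32x4_t r3 = vld1q_s32(src + 12);
//   transpose_s32_4x4(&r0, &r1, &r2, &r3);
//   // r0..r3 now hold columns 0..3 of the original block.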

#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_