/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

// Swap high and low halves.
static INLINE uint16x8_t transpose64_u16q(const uint16x8_t a) {
  return vextq_u16(a, a, 4);
}
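
// A usage sketch (illustrative only, not part of the original header):
// vextq_u16 rotates the eight lanes left by four, so the two 64-bit halves
// trade places.
//   const uint16x8_t v = { 0, 1, 2, 3, 4, 5, 6, 7 };  // GCC/Clang extension
//   const uint16x8_t r = transpose64_u16q(v);  // r = { 4, 5, 6, 7, 0, 1, 2, 3 }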

static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
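
// A usage sketch (illustrative; `src`, `dst` and `stride` are hypothetical
// caller-provided values): transpose an 8x8 block of bytes in registers.
//   uint8x8_t r0 = vld1_u8(src + 0 * stride), r1 = vld1_u8(src + 1 * stride),
//             r2 = vld1_u8(src + 2 * stride), r3 = vld1_u8(src + 3 * stride),
//             r4 = vld1_u8(src + 4 * stride), r5 = vld1_u8(src + 5 * stride),
//             r6 = vld1_u8(src + 6 * stride), r7 = vld1_u8(src + 7 * stride);
//   transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
//   vst1_u8(dst + 0 * stride, r0);  // r0 now holds column 0 of the block
//   ...                             // likewise for r1..r7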

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}
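
// A usage sketch (illustrative; `src` and `stride` are hypothetical, and the
// u32 lane loads assume 4-byte alignment): a 4x4 byte block fits in two D
// registers, two rows per register.
//   uint32x2_t t0 = vdup_n_u32(0), t1 = vdup_n_u32(0);
//   t0 = vld1_lane_u32((const uint32_t *)(src + 0 * stride), t0, 0);
//   t0 = vld1_lane_u32((const uint32_t *)(src + 1 * stride), t0, 1);
//   t1 = vld1_lane_u32((const uint32_t *)(src + 2 * stride), t1, 0);
//   t1 = vld1_lane_u32((const uint32_t *)(src + 3 * stride), t1, 1);
//   uint8x8_t a0 = vreinterpret_u8_u32(t0);  // rows 0 and 1
//   uint8x8_t a1 = vreinterpret_u8_u32(t1);  // rows 2 and 3
//   transpose_u8_4x4(&a0, &a1);  // a0: columns 0 and 2, a1: columns 1 and 3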

static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}
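
// A usage sketch (illustrative): only the low four bytes of each input matter
// (the XX lanes above are don't-cares). a0..a3 are read and then overwritten
// with the four transposed rows, each eight wide; a4..a7 are consumed by
// value.
//   transpose_u8_4x8(&r0, &r1, &r2, &r3, r4, r5, r6, r7);
//   // r0: 00 10 20 30 40 50 60 70, r1: 01 11 21 31 41 51 61 71, ...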

// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
static INLINE void transpose_u16_4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}
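
// A usage sketch (illustrative; `src` and `stride` are hypothetical):
//   uint16x4_t rows[4];
//   for (int i = 0; i < 4; ++i) rows[i] = vld1_u16(src + i * stride);
//   transpose_u16_4x4(rows);  // rows[i] now holds column i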

// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
static INLINE void transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(const uint32x4_t a0,
                                                const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}
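
// aom_vtrnq_u64_to_u16 stands in for the missing vtrnq_u64 intrinsic:
// treating each input as two 64-bit elements, val[0] gathers the low elements
// of a0 and a1 and val[1] gathers the high elements, reinterpreted as
// uint16x8_t. An illustrative view with 32-bit lanes:
//   a0 = { A0 A1 A2 A3 }, a1 = { B0 B1 B2 B3 }
//   val[0] = { A0 A1 B0 B1 }, val[1] = { A2 A3 B2 B3 }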

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_u16_4x8q will produce the same reversals, but with the
// order of the low halves also restored relative to the high halves. This is
// preferable because it puts all values from the same source row back together,
// but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
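
// A usage sketch (illustrative): after the call each vector pairs pN with qN,
// so a loop filter can split them with vget_low/vget_high.
//   const uint16x4_t p0 = vget_low_u16(a[0]);   // p0 of the four columns
//   const uint16x4_t q0 = vget_high_u16(a[0]);  // q0 of the four columns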

static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
  uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
  uint16x4x2_t b3 = vtrn_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                             vreinterpret_u32_u16(b1.val[0]));
  uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                             vreinterpret_u32_u16(b1.val[1]));
  uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
                             vreinterpret_u32_u16(b3.val[0]));
  uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
                             vreinterpret_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
                     vreinterpret_u16_u32(c2.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
                     vreinterpret_u16_u32(c3.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
                     vreinterpret_u16_u32(c2.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
                     vreinterpret_u16_u32(c3.val[1]));
}
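
// A usage sketch (illustrative; `src` and `stride` are hypothetical): eight
// 4-wide rows in, four 8-wide columns out.
//   uint16x4_t r[8];
//   uint16x8_t col[4];
//   for (int i = 0; i < 8; ++i) r[i] = vld1_u16(src + i * stride);
//   transpose_u16_4x8(&r[0], &r[1], &r[2], &r[3], &r[4], &r[5], &r[6], &r[7],
//                     &col[0], &col[1], &col[2], &col[3]);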

static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1,
                                     int16x4_t *a2, int16x4_t *a3,
                                     int16x4_t *a4, int16x4_t *a5,
                                     int16x4_t *a6, int16x4_t *a7,
                                     int16x8_t *o0, int16x8_t *o1,
                                     int16x8_t *o2, int16x8_t *o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  int16x4x2_t b1 = vtrn_s16(*a2, *a3);
  int16x4x2_t b2 = vtrn_s16(*a4, *a5);
  int16x4x2_t b3 = vtrn_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                            vreinterpret_s32_s16(b1.val[0]));
  int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                            vreinterpret_s32_s16(b1.val[1]));
  int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                            vreinterpret_s32_s16(b3.val[0]));
  int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                            vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
  *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));

  *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
  *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));

  *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
  *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));

  *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
  *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
                     vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
}
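
// Note: the vcombine/vget_low/vget_high pattern in transpose_u16_8x8 (and in
// transpose_s16_8x8 below) stands in for the missing 64-bit vtrnq: each
// output row pairs one 64-bit half from the c0/c1 group with the matching
// half from the c2/c3 group.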

static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
  *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));

  *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
  *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));

  *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
  *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));

  *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
  *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
                     vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
  const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
  const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
  const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *out = d0.val[0];
  *(out + 1) = d1.val[0];
  *(out + 2) = d2.val[0];
  *(out + 3) = d3.val[0];
  *(out + 4) = d0.val[1];
  *(out + 5) = d1.val[1];
  *(out + 6) = d2.val[1];
  *(out + 7) = d3.val[1];
}
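
// A usage sketch (illustrative): a0 and out are expected to point at eight
// consecutive int16x8_t registers; the transpose is out-of-place, so the
// inputs are left untouched.
//   int16x8_t in[8], cols[8];
//   ...load in[0..7]...
//   transpose_s16_8x8q(in, cols);  // cols[i] holds column i of the 8x8 block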

static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}
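
// A usage sketch (illustrative; `coeffs` is a hypothetical 16-element,
// row-major buffer):
//   int32x4_t r0 = vld1q_s32(coeffs + 0), r1 = vld1q_s32(coeffs + 4),
//             r2 = vld1q_s32(coeffs + 8), r3 = vld1q_s32(coeffs + 12);
//   transpose_s32_4x4(&r0, &r1, &r2, &r3);  // r0..r3 now hold columns 0..3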

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_