/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "aom/aom_integer.h"  // For AOM_FORCE_INLINE.
#include "config/aom_config.h"

static INLINE void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
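
// Usage sketch for transpose_elems_u8_8x8 (illustrative only; `src`, `dst`
// and their strides are hypothetical, not part of this header):
//
//   uint8x8_t r[8], c[8];
//   for (int i = 0; i < 8; ++i) r[i] = vld1_u8(src + i * src_stride);
//   transpose_elems_u8_8x8(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7],
//                          &c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
//                          &c[7]);
//   for (int i = 0; i < 8; ++i) vst1_u8(dst + i * dst_stride, c[i]);
//
// Afterwards c[i] holds column i of the 8x8 block loaded from src.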

static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}

static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}

static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}

static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}

static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}
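
// Worked example for aom_vtrnq_u64_to_u16, with the 32-bit lanes of the
// inputs labelled a0 = { A0 A1 A2 A3 } and a1 = { B0 B1 B2 B3 }:
//   b0.val[0]: A0 A1 B0 B1  (low 64-bit halves)
//   b0.val[1]: A2 A3 B2 B3  (high 64-bit halves)
// Both paths compute the same result: AArch64 has dedicated TRN1/TRN2
// instructions for 64-bit lanes, while the Armv7 path recombines the
// half-vectors by hand.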

static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}

static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}
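
// Note: transpose_arrays_u8_32x16 relies on x[i].val[0] holding bytes 0-15
// and x[i].val[1] holding bytes 16-31 of row i, so the 32x16 transpose
// decomposes into two independent 16x16 transposes whose results land in
// d[0..15] and d[16..31] respectively.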

static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03  10 11 12 13
  // a1: 20 21 22 23  30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21  10 11 30 31
  // b0.val[1]: 02 03 22 23  12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21  02 03 22 23
  // c0.val[1]: 10 11 30 31  12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30  02 12 22 32
  // d0.val[1]: 01 11 21 31  03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *o0 = d0.val[0];
  *o1 = d0.val[1];
  *o2 = d1.val[0];
  *o3 = d1.val[1];
}

static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Input:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));

  // Output:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37

  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37
  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose (transpose_array_inplace_u16_4x8) will produce the
// same reversals, but with the order of the low halves also restored relative
// to the high halves. This is preferable because it puts all values from the
// same source row back together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
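
// A minimal usage sketch for the loop filter transpose (illustrative; `src`
// and `stride` are hypothetical, with `src` pointing at the first q pixel of
// a vertical edge). Four rows of eight 16-bit pixels straddling the edge go
// in, and the paired p/q vectors come out:
//
//   uint16x8_t pq[4];
//   for (int i = 0; i < 4; ++i) pq[i] = vld1q_u16(src + i * stride - 4);
//   loop_filter_transpose_u16_4x8q(pq);
//   // pq[0] = p0q0, pq[1] = p1q1, pq[2] = p2q2, pq[3] = p3q3.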

static INLINE void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const uint16x8_t b0 = vcombine_u16(a0, a4);
  const uint16x8_t b1 = vcombine_u16(a1, a5);
  const uint16x8_t b2 = vcombine_u16(a2, a6);
  const uint16x8_t b3 = vcombine_u16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpretq_u16_u32(d0.val[0]);
  *o1 = vreinterpretq_u16_u32(d1.val[0]);
  *o2 = vreinterpretq_u16_u32(d0.val[1]);
  *o3 = vreinterpretq_u16_u32(d1.val[1]);
}

static INLINE void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static INLINE void transpose_elems_inplace_u16_8x8(
    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}

static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2, int16x8_t *a3,
                                                   int16x8_t *a4, int16x8_t *a5,
                                                   int16x8_t *a6,
                                                   int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
                                            int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}

static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                   uint16x4_t *a1,
                                                   uint16x4_t *a2,
                                                   uint16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b1.val[0]));
  const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                                   vreinterpret_u32_u16(b1.val[1]));

  *a0 = vreinterpret_u16_u32(c0.val[0]);
  *a1 = vreinterpret_u16_u32(c1.val[0]);
  *a2 = vreinterpret_u16_u32(c0.val[1]);
  *a3 = vreinterpret_u16_u32(c1.val[1]);
}

static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
                                                   int16x4_t *a2,
                                                   int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
                                           const int32x4_t a1,
                                           const int32x4_t a2,
                                           const int32x4_t a3, int32x4_t *o0,
                                           int32x4_t *o1, int32x4_t *o2,
                                           int32x4_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
  const int32x4x2_t b1 = vtrnq_s32(a2, a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *o0 = c0.val[0];
  *o1 = c1.val[0];
  *o2 = c0.val[1];
  *o3 = c1.val[1];
}

static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                   int32x4_t *a2,
                                                   int32x4_t *a3) {
  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}

static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
                                            int32x4_t *out) {
  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                          &out[3]);
}

static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
                                                        int32x4_t *out,
                                                        const int width,
                                                        const int height) {
  const int h = height >> 2;
  const int w = width >> 2;
  for (int j = 0; j < w; j++) {
    for (int i = 0; i < h; i++) {
      transpose_arrays_s32_4x4(in + j * height + i * 4,
                               out + i * width + j * 4);
    }
  }
}
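
// Note on layout: transpose_arrays_s32_4nx4n treats `in` as a width x height
// matrix stored in strips of four columns, i.e. register in[j * height + r]
// holds row r of the j-th strip. The output uses the same convention with
// width and height exchanged, which is why each 4x4 block at (i, j) is
// written to (j, i).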

#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
  static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
      const int32x4_t *in, int32x4_t *out) {                   \
    transpose_arrays_s32_4nx4n(in, out, w, h);                 \
  }

TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
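
// For example, the generated transpose_arrays_s32_8x8 transposes an 8x8 block
// held in sixteen int32x4_t registers, laid out as two 4-column strips of
// eight rows each:
//
//   int32x4_t in[16], out[16];  // in[r] and in[8 + r] hold columns 0-3 and
//                               // 4-7 of row r, respectively.
//   transpose_arrays_s32_8x8(in, out);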

static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
#endif
}

static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}

static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A ]^T => [ A^T B^T ]
  // [ B ]

  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  o0->val[0] = a0;
  o1->val[0] = a1;
  o2->val[0] = a2;
  o3->val[0] = a3;

  o0->val[1] = a4;
  o1->val[1] = a5;
  o2->val[1] = a6;
  o3->val[1] = a7;
}

static INLINE void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A B ]^T => [ A^T C^T ]
  // [ C D ]      [ B^T D^T ]

  int32x4_t q0_v1 = a0->val[0];
  int32x4_t q0_v2 = a1->val[0];
  int32x4_t q0_v3 = a2->val[0];
  int32x4_t q0_v4 = a3->val[0];

  int32x4_t q1_v1 = a0->val[1];
  int32x4_t q1_v2 = a1->val[1];
  int32x4_t q1_v3 = a2->val[1];
  int32x4_t q1_v4 = a3->val[1];

  int32x4_t q2_v1 = a4->val[0];
  int32x4_t q2_v2 = a5->val[0];
  int32x4_t q2_v3 = a6->val[0];
  int32x4_t q2_v4 = a7->val[0];

  int32x4_t q3_v1 = a4->val[1];
  int32x4_t q3_v2 = a5->val[1];
  int32x4_t q3_v3 = a6->val[1];
  int32x4_t q3_v4 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T

  a0->val[0] = q0_v1;
  a1->val[0] = q0_v2;
  a2->val[0] = q0_v3;
  a3->val[0] = q0_v4;

  a0->val[1] = q2_v1;
  a1->val[1] = q2_v2;
  a2->val[1] = q2_v3;
  a3->val[1] = q2_v4;

  a4->val[0] = q1_v1;
  a5->val[0] = q1_v2;
  a6->val[0] = q1_v3;
  a7->val[0] = q1_v4;

  a4->val[1] = q3_v1;
  a5->val[1] = q3_v2;
  a6->val[1] = q3_v3;
  a7->val[1] = q3_v4;
}
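
// A minimal sketch of the in-place 8x8 transpose (illustrative); each
// int32x4x2_t argument holds one full row of eight 32-bit elements, with
// val[0] carrying columns 0-3 and val[1] columns 4-7:
//
//   int32x4x2_t row[8];  // Fill row[0..7] with the source rows first.
//   transpose_elems_inplace_s32_8x8(&row[0], &row[1], &row[2], &row[3],
//                                   &row[4], &row[5], &row[6], &row[7]);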

static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t a0 = in[0];
  int16x4_t a1 = in[1];
  int16x4_t a2 = in[2];
  int16x4_t a3 = in[3];

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  out[0] = a0;
  out[1] = a1;
  out[2] = a2;
  out[3] = a3;
}

static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
#if AOM_ARCH_AARCH64
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Swap 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
                                    vreinterpretq_u32_s16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
                                    vreinterpretq_u32_s16(b1.val[1]));

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37

  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_