/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "aom/aom_integer.h"  // For AOM_FORCE_INLINE.
#include "config/aom_config.h"

static INLINE void transpose_elems_u8_8x8(
    uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
    uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
  const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2, uint8x8_t *a3,
                                                  uint8x8_t *a4, uint8x8_t *a5,
                                                  uint8x8_t *a6,
                                                  uint8x8_t *a7) {
  transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
                         a4, a5, a6, a7);
}

static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
                                           uint8x8_t *out) {
  transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                         &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
                         &out[6], &out[7]);
}

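// Usage sketch (illustrative only, not part of the original API): transpose an
// 8x8 block of pixels between a row-major source and destination buffer. The
// function name, the buffers and the strides below are assumptions made for
// the example.
static INLINE void example_transpose_block_u8_8x8(const uint8_t *src,
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride) {
  uint8x8_t rows[8];
  // Each vector holds one source row of 8 bytes.
  for (int i = 0; i < 8; ++i) rows[i] = vld1_u8(src + i * src_stride);
  // After the call, rows[i] holds source column i. Passing the same array as
  // input and output is safe because the inputs are taken by value.
  transpose_arrays_u8_8x8(rows, rows);
  for (int i = 0; i < 8; ++i) vst1_u8(dst + i * dst_stride, rows[i]);
}
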
static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
                                                      uint8x16_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
  uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
  uint8x8x2_t w3 = vzip_u8(x[6], x[7]);

  uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
  uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
  uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
  uint8x8x2_t w11 = vzip_u8(x[14], x[15]);

  uint16x4x2_t w4 =
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
  uint16x4x2_t w5 =
      vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
  uint16x4x2_t w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
  uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
                              vreinterpret_u16_u8(w11.val[0]));

  uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                             vreinterpret_u32_u16(w5.val[0]));
  uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                             vreinterpret_u32_u16(w5.val[1]));
  uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                              vreinterpret_u32_u16(w13.val[0]));
  uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                              vreinterpret_u32_u16(w13.val[1]));

  // Store first 4-line result
  d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));

  w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
  w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
  w12 =
      vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
  w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
                 vreinterpret_u16_u8(w11.val[1]));

  w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
                vreinterpret_u32_u16(w5.val[0]));
  w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
                vreinterpret_u32_u16(w5.val[1]));
  w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
                 vreinterpret_u32_u16(w13.val[0]));
  w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
                 vreinterpret_u32_u16(w13.val[1]));

  // Store second 4-line result
  d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
  d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
  d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
  d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
}

static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
                                                      uint8x8_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                              vreinterpretq_u16_u8(w1.val[1]));
  uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                              vreinterpretq_u16_u8(w3.val[1]));

  uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
                              vreinterpretq_u32_u16(w5.val[0]));
  uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
                              vreinterpretq_u32_u16(w7.val[0]));
  uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
                               vreinterpretq_u32_u16(w5.val[1]));
  uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
                               vreinterpretq_u32_u16(w7.val[1]));

  d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
  d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
  d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
  d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
  d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
  d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
  d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
  d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
  d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
  d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
  d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
  d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
  d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
  d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
  d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
  d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
}

static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_u16_u64(
      vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
  b0.val[1] = vreinterpretq_u16_u64(
      vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
  return b0;
}

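// Note: aom_vtrnq_u64_to_u16 above is the 64-bit-lane transpose step used by
// the wider helpers below. A worked example (added here for illustration, not
// taken from the original comments), viewing the inputs as u16 lanes:
//   a0    : 00 01 02 03 04 05 06 07
//   a1    : 10 11 12 13 14 15 16 17
//   val[0]: 00 01 02 03 10 11 12 13
//   val[1]: 04 05 06 07 14 15 16 17
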
static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
                                             uint8x16_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
  uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
  uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);

  uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
  uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
  uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
  uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);

  uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                              vreinterpretq_u16_u8(w1.val[0]));
  uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
                              vreinterpretq_u16_u8(w3.val[0]));
  uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
                               vreinterpretq_u16_u8(w5.val[0]));
  uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
                               vreinterpretq_u16_u8(w7.val[0]));

  uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                               vreinterpretq_u32_u16(w9.val[0]));
  uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                               vreinterpretq_u32_u16(w11.val[0]));
  uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                               vreinterpretq_u32_u16(w9.val[1]));
  uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                               vreinterpretq_u32_u16(w11.val[1]));

  uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[0] = vreinterpretq_u8_u16(d01.val[0]);
  d[1] = vreinterpretq_u8_u16(d01.val[1]);
  uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[2] = vreinterpretq_u8_u16(d23.val[0]);
  d[3] = vreinterpretq_u8_u16(d23.val[1]);
  uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[4] = vreinterpretq_u8_u16(d45.val[0]);
  d[5] = vreinterpretq_u8_u16(d45.val[1]);
  uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[6] = vreinterpretq_u8_u16(d67.val[0]);
  d[7] = vreinterpretq_u8_u16(d67.val[1]);

  // upper half
  w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                 vreinterpretq_u16_u8(w1.val[1]));
  w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
                 vreinterpretq_u16_u8(w3.val[1]));
  w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
                  vreinterpretq_u16_u8(w5.val[1]));
  w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
                  vreinterpretq_u16_u8(w7.val[1]));

  w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
                  vreinterpretq_u32_u16(w9.val[0]));
  w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
                  vreinterpretq_u32_u16(w11.val[0]));
  w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
                  vreinterpretq_u32_u16(w9.val[1]));
  w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
                  vreinterpretq_u32_u16(w11.val[1]));

  d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
  d[8] = vreinterpretq_u8_u16(d01.val[0]);
  d[9] = vreinterpretq_u8_u16(d01.val[1]);
  d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
  d[10] = vreinterpretq_u8_u16(d23.val[0]);
  d[11] = vreinterpretq_u8_u16(d23.val[1]);
  d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
  d[12] = vreinterpretq_u8_u16(d45.val[0]);
  d[13] = vreinterpretq_u8_u16(d45.val[1]);
  d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
  d[14] = vreinterpretq_u8_u16(d67.val[0]);
  d[15] = vreinterpretq_u8_u16(d67.val[1]);
}

static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
                                                       uint8x16_t *d) {
  uint8x16_t x2[32];
  for (int i = 0; i < 16; ++i) {
    x2[i] = x[i].val[0];
    x2[i + 16] = x[i].val[1];
  }
  transpose_arrays_u8_16x16(x2, d);
  transpose_arrays_u8_16x16(x2 + 16, d + 16);
}

static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
                                                  uint8x8_t *a2,
                                                  uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                  uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
                                          uint8x8_t a2, uint8x8_t a3,
                                          uint8x8_t a4, uint8x8_t a5,
                                          uint8x8_t a6, uint8x8_t a7,
                                          uint8x8_t *o0, uint8x8_t *o1,
                                          uint8x8_t *o2, uint8x8_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *o0 = d0.val[0];
  *o1 = d0.val[1];
  *o2 = d1.val[0];
  *o3 = d1.val[1];
}

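// Usage sketch (illustrative only): gather a 4-wide, 8-high patch so that each
// output vector holds one image column. The buffer, stride and function name
// are assumptions for the example; the source is assumed to be at least 8
// bytes wide because vld1_u8 reads 8 bytes per row even though only the first
// 4 are significant (the XX lanes above).
static INLINE void example_load_transpose_u8_4x8(const uint8_t *src, int stride,
                                                 uint8x8_t *col0,
                                                 uint8x8_t *col1,
                                                 uint8x8_t *col2,
                                                 uint8x8_t *col3) {
  const uint8x8_t r0 = vld1_u8(src + 0 * stride);
  const uint8x8_t r1 = vld1_u8(src + 1 * stride);
  const uint8x8_t r2 = vld1_u8(src + 2 * stride);
  const uint8x8_t r3 = vld1_u8(src + 3 * stride);
  const uint8x8_t r4 = vld1_u8(src + 4 * stride);
  const uint8x8_t r5 = vld1_u8(src + 5 * stride);
  const uint8x8_t r6 = vld1_u8(src + 6 * stride);
  const uint8x8_t r7 = vld1_u8(src + 7 * stride);
  // col0..col3 each receive one source column of 8 pixels.
  transpose_elems_u8_4x8(r0, r1, r2, r3, r4, r5, r6, r7, col0, col1, col2,
                         col3);
}
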
static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
  // Input:
  // 00 01 02 03
  // 10 11 12 13
  // 20 21 22 23
  // 30 31 32 33

  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));

  // Output:
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
  // 4x8 Input:
  // a[0]: 00 01 02 03 04 05 06 07
  // a[1]: 10 11 12 13 14 15 16 17
  // a[2]: 20 21 22 23 24 25 26 27
  // a[3]: 30 31 32 33 34 35 36 37

  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  // 8x4 Output:
  // a[0]: 00 10 20 30 04 14 24 34
  // a[1]: 01 11 21 31 05 15 25 35
  // a[2]: 02 12 22 32 06 16 26 36
  // a[3]: 03 13 23 33 07 17 27 37
  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard transpose_array_inplace_u16_4x8 will produce the same
// reversals, but with the order of the low halves also restored relative to
// the high halves. This is preferable because it puts all values from the
// same source row back together, but some post-processing is inevitable.
static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}

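// Usage sketch (illustrative only, names are assumptions): feed four rows of
// p3 p2 p1 p0 q0 q1 q2 q3 pixels to the special transpose and pick out the
// p0/q0 pair that a vertical loop filter operates on.
static INLINE void example_loop_filter_transpose(uint16x8_t rows[4],
                                                 uint16x4_t *p0,
                                                 uint16x4_t *q0) {
  loop_filter_transpose_u16_4x8q(rows);
  // rows[0] now holds p0 for the four columns in its low half and q0 in its
  // high half; rows[1..3] hold p1q1, p2q2 and p3q3 in the same way.
  *p0 = vget_low_u16(rows[0]);
  *q0 = vget_high_u16(rows[0]);
}
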
static INLINE void transpose_elems_u16_4x8(
    const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
    const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
    const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
    uint16x8_t *o2, uint16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const uint16x8_t b0 = vcombine_u16(a0, a4);
  const uint16x8_t b1 = vcombine_u16(a1, a5);
  const uint16x8_t b2 = vcombine_u16(a2, a6);
  const uint16x8_t b3 = vcombine_u16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
  const uint16x8x2_t c1 = vtrnq_u16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *o0 = vreinterpretq_u16_u32(d0.val[0]);
  *o1 = vreinterpretq_u16_u32(d1.val[0]);
  *o2 = vreinterpretq_u16_u32(d0.val[1]);
  *o3 = vreinterpretq_u16_u32(d1.val[1]);
}

static INLINE void transpose_elems_s16_4x8(
    const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
    const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
    const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
    int16x8_t *o2, int16x8_t *o3) {
  // Combine rows. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0: 00 01 02 03 40 41 42 43
  // b1: 10 11 12 13 50 51 52 53
  // b2: 20 21 22 23 60 61 62 63
  // b3: 30 31 32 33 70 71 72 73

  const int16x8_t b0 = vcombine_s16(a0, a4);
  const int16x8_t b1 = vcombine_s16(a1, a5);
  const int16x8_t b2 = vcombine_s16(a2, a6);
  const int16x8_t b3 = vcombine_s16(a3, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 02 12 40 50 42 52
  // c0.val[1]: 01 11 03 13 41 51 43 53
  // c1.val[0]: 20 30 22 32 60 70 62 72
  // c1.val[1]: 21 31 23 33 61 71 63 73

  const int16x8x2_t c0 = vtrnq_s16(b0, b1);
  const int16x8x2_t c1 = vtrnq_s16(b2, b3);

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 02 12 22 32 42 52 62 72
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
                                   vreinterpretq_s32_s16(c1.val[0]));
  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
                                   vreinterpretq_s32_s16(c1.val[1]));

  *o0 = vreinterpretq_s16_s32(d0.val[0]);
  *o1 = vreinterpretq_s16_s32(d1.val[0]);
  *o2 = vreinterpretq_s16_s32(d0.val[1]);
  *o3 = vreinterpretq_s16_s32(d1.val[1]);
}

static INLINE void transpose_elems_inplace_u16_8x8(
    uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
    uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s16_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s16_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
  return b0;
}

static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                                   int16x8_t *a2, int16x8_t *a3,
                                                   int16x8_t *a4, int16x8_t *a5,
                                                   int16x8_t *a6,
                                                   int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

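// Usage sketch (illustrative only): the in-place 8x8 variant is convenient
// between the row and column passes of an 8x8 16-bit transform, where the
// block is already held in eight registers. The buffer, stride and function
// name below are assumptions for the example.
static INLINE void example_transpose_block_s16_8x8(const int16_t *src,
                                                   int stride, int16x8_t s[8]) {
  for (int i = 0; i < 8; ++i) s[i] = vld1q_s16(src + i * stride);
  transpose_elems_inplace_s16_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                                  &s[6], &s[7]);
  // s[i] now holds source column i.
}
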
static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
                                            int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}

static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                   uint16x4_t *a1,
                                                   uint16x4_t *a2,
                                                   uint16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b1.val[0]));
  const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
                                   vreinterpret_u32_u16(b1.val[1]));

  *a0 = vreinterpret_u16_u32(c0.val[0]);
  *a1 = vreinterpret_u16_u32(c1.val[0]);
  *a2 = vreinterpret_u16_u32(c0.val[1]);
  *a3 = vreinterpret_u16_u32(c1.val[1]);
}

static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
                                                   int16x4_t *a2,
                                                   int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
#if AOM_ARCH_AARCH64
  b0.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b0.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b0;
}

static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
                                           const int32x4_t a1,
                                           const int32x4_t a2,
                                           const int32x4_t a3, int32x4_t *o0,
                                           int32x4_t *o1, int32x4_t *o2,
                                           int32x4_t *o3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(a0, a1);
  const int32x4x2_t b1 = vtrnq_s32(a2, a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *o0 = c0.val[0];
  *o1 = c1.val[0];
  *o2 = c0.val[1];
  *o3 = c1.val[1];
}

static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                                   int32x4_t *a2,
                                                   int32x4_t *a3) {
  transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
}

static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
                                            int32x4_t *out) {
  transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
                          &out[3]);
}

static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
                                                        int32x4_t *out,
                                                        const int width,
                                                        const int height) {
  const int h = height >> 2;
  const int w = width >> 2;
  for (int j = 0; j < w; j++) {
    for (int i = 0; i < h; i++) {
      transpose_arrays_s32_4x4(in + j * height + i * 4,
                               out + i * width + j * 4);
    }
  }
}

#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h)                    \
  static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
      const int32x4_t *in, int32x4_t *out) {                   \
    transpose_arrays_s32_4nx4n(in, out, w, h);                 \
  }

TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)

#undef TRANSPOSE_ARRAYS_S32_WXH_NEON

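// Usage sketch (illustrative only): as implied by the indexing in
// transpose_arrays_s32_4nx4n, element (row, col) of a matrix with `rows` rows
// is expected in vec[(col >> 2) * rows + row], lane (col & 3); the input uses
// rows = height and the output uses rows = width. The example below moves a
// row-major 8x8 block of 32-bit coefficients through the generated
// transpose_arrays_s32_8x8 helper; the buffers and function name are
// assumptions for the example.
static INLINE void example_transpose_s32_8x8(const int32_t *src,
                                             int32_t *dst) {
  int32x4_t in[16], out[16];
  for (int col4 = 0; col4 < 2; ++col4) {
    for (int row = 0; row < 8; ++row) {
      // Columns 4*col4..4*col4+3 of source row `row`.
      in[col4 * 8 + row] = vld1q_s32(src + row * 8 + col4 * 4);
    }
  }
  transpose_arrays_s32_8x8(in, out);
  for (int col4 = 0; col4 < 2; ++col4) {
    for (int row = 0; row < 8; ++row) {
      // The transposed block comes back in the same column-group layout.
      vst1q_s32(dst + row * 8 + col4 * 4, out[col4 * 8 + row]);
    }
  }
}
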
static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn1q_s64(a, b);
#else
  return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
#endif
}

static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
#if AOM_ARCH_AARCH64
  return vtrn2q_s64(a, b);
#else
  return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}

static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
                                           int32x4_t a2, int32x4_t a3,
                                           int32x4_t a4, int32x4_t a5,
                                           int32x4_t a6, int32x4_t a7,
                                           int32x4x2_t *o0, int32x4x2_t *o1,
                                           int32x4x2_t *o2, int32x4x2_t *o3) {
  // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A ]^T => [ A^T B^T ]
  // [ B ]

  transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3);  // A^T
  transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7);  // B^T

  o0->val[0] = a0;
  o1->val[0] = a1;
  o2->val[0] = a2;
  o3->val[0] = a3;

  o0->val[1] = a4;
  o1->val[1] = a5;
  o2->val[1] = a6;
  o3->val[1] = a7;
}

static INLINE void transpose_elems_inplace_s32_8x8(
    int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
    int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
  // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
  // matrix transpose implementation:
  // [ A B ]^T => [ A^T C^T ]
  // [ C D ]      [ B^T D^T ]

  int32x4_t q0_v1 = a0->val[0];
  int32x4_t q0_v2 = a1->val[0];
  int32x4_t q0_v3 = a2->val[0];
  int32x4_t q0_v4 = a3->val[0];

  int32x4_t q1_v1 = a0->val[1];
  int32x4_t q1_v2 = a1->val[1];
  int32x4_t q1_v3 = a2->val[1];
  int32x4_t q1_v4 = a3->val[1];

  int32x4_t q2_v1 = a4->val[0];
  int32x4_t q2_v2 = a5->val[0];
  int32x4_t q2_v3 = a6->val[0];
  int32x4_t q2_v4 = a7->val[0];

  int32x4_t q3_v1 = a4->val[1];
  int32x4_t q3_v2 = a5->val[1];
  int32x4_t q3_v3 = a6->val[1];
  int32x4_t q3_v4 = a7->val[1];

  transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4);  // A^T
  transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4);  // B^T
  transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4);  // C^T
  transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4);  // D^T

  a0->val[0] = q0_v1;
  a1->val[0] = q0_v2;
  a2->val[0] = q0_v3;
  a3->val[0] = q0_v4;

  a0->val[1] = q2_v1;
  a1->val[1] = q2_v2;
  a2->val[1] = q2_v3;
  a3->val[1] = q2_v4;

  a4->val[0] = q1_v1;
  a5->val[0] = q1_v2;
  a6->val[0] = q1_v3;
  a7->val[0] = q1_v4;

  a4->val[1] = q3_v1;
  a5->val[1] = q3_v2;
  a6->val[1] = q3_v3;
  a7->val[1] = q3_v4;
}

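// Usage sketch (illustrative only): each int32x4x2_t above represents one
// 8-element row split across two quad registers, so a row of a row-major 8x8
// block of 32-bit values can be loaded with two vld1q_s32 calls. The function
// name is an assumption for the example.
static INLINE int32x4x2_t example_load_row_s32x8(const int32_t *row) {
  int32x4x2_t r;
  r.val[0] = vld1q_s32(row);      // elements 0..3
  r.val[1] = vld1q_s32(row + 4);  // elements 4..7
  return r;
}
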
static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
                                            int16x4_t *const out) {
  int16x4_t a0 = in[0];
  int16x4_t a1 = in[1];
  int16x4_t a2 = in[2];
  int16x4_t a3 = in[3];

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  out[0] = a0;
  out[1] = a1;
  out[2] = a2;
  out[3] = a3;
}

static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
                                            int16x8_t *const out) {
#if AOM_ARCH_AARCH64
  const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
                                  vcombine_s16(in[1], vdup_n_s16(0)));
  const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
                                  vcombine_s16(in[3], vdup_n_s16(0)));
  const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
                                  vcombine_s16(in[5], vdup_n_s16(0)));
  const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
                                  vcombine_s16(in[7], vdup_n_s16(0)));
#else
  int16x4x2_t temp;
  temp = vzip_s16(in[0], in[1]);
  const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[2], in[3]);
  const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[4], in[5]);
  const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
  temp = vzip_s16(in[6], in[7]);
  const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
#endif

  const int32x4x2_t b02 =
      vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
  const int32x4x2_t b13 =
      vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));

#if AOM_ARCH_AARCH64
  out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
                                            vreinterpretq_s64_s32(b13.val[0])));
  out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
  out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
                                            vreinterpretq_s64_s32(b13.val[1])));
#else
  out[0] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
  out[2] = vreinterpretq_s16_s32(
      vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
  out[1] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
  out[3] = vreinterpretq_s16_s32(
      vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
#endif
}

static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
                                            int16x4_t *const out) {
  // Swap 16 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
  const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
                                    vreinterpretq_u32_s16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
                                    vreinterpretq_u32_s16(b1.val[1]));

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37

  out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
  out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
  out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
  out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
  out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
}

#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_