1 /*
2 * Copyright 2019 The libgav1 Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
18 #define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
19
20 #include "src/utils/cpu.h"
21
22 #if LIBGAV1_ENABLE_NEON
23
24 #include <arm_neon.h>
25
26 #include <algorithm>
27 #include <cstddef>
28 #include <cstdint>
29 #include <cstring>
30
31 #include "src/utils/compiler_attributes.h"
32
#if 0
#include <cstdio>
#include <string>

constexpr bool kEnablePrintRegs = true;

// Overlay that lets the lanes of a 64-bit (D) NEON register be read back as
// plain integer arrays after a vst1_* store.
union DebugRegister {
  int8_t i8[8];
  int16_t i16[4];
  int32_t i32[2];
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
};

// Overlay for a 128-bit (Q) NEON register.
union DebugRegisterQ {
  int8_t i8[16];
  int16_t i16[8];
  int32_t i32[4];
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
};

// Quite useful helper for debugging. Left here for convenience.
// Prints the lanes of |r| to stderr in hex, interpreted as |size|-bit unsigned
// elements, prefixed by |name|.
inline void PrintVect(const DebugRegister r, const char* const name, int size) {
  int n;
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s\t: ", name);
    if (size == 8) {
      for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
    } else if (size == 16) {
      for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
    } else if (size == 32) {
      for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
    }
    fprintf(stderr, "\n");
  }
}

// Debugging helper for 128-bit types; same format as PrintVect().
inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
                       int size) {
  int n;
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s\t: ", name);
    if (size == 8) {
      for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
    } else if (size == 16) {
      for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
    } else if (size == 32) {
      for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
    }
    fprintf(stderr, "\n");
  }
}

// PrintReg() overloads: store the vector through the debug overlay and print
// its lanes. One overload per vector type used in this codebase.
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val.val[0]);
  const std::string name0 = name + std::string(".val[0]");
  PrintVectQ(r, name0.c_str(), 32);
  vst1q_s32(r.i32, val.val[1]);
  const std::string name1 = name + std::string(".val[1]");
  PrintVectQ(r, name1.c_str(), 32);
}

inline void PrintReg(const uint32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u32(r.u32, val);
  PrintVectQ(r, name, 32);
}

inline void PrintReg(const uint32x2_t val, const char* name) {
  DebugRegister r;
  vst1_u32(r.u32, val);
  PrintVect(r, name, 32);
}

inline void PrintReg(const uint16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u16(r.u16, val);
  PrintVectQ(r, name, 16);
}

inline void PrintReg(const uint16x4_t val, const char* name) {
  DebugRegister r;
  vst1_u16(r.u16, val);
  PrintVect(r, name, 16);
}

inline void PrintReg(const uint8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u8(r.u8, val);
  PrintVectQ(r, name, 8);
}

inline void PrintReg(const uint8x8_t val, const char* name) {
  DebugRegister r;
  vst1_u8(r.u8, val);
  PrintVect(r, name, 8);
}

inline void PrintReg(const int32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val);
  PrintVectQ(r, name, 32);
}

inline void PrintReg(const int32x2_t val, const char* name) {
  DebugRegister r;
  vst1_s32(r.i32, val);
  PrintVect(r, name, 32);
}

inline void PrintReg(const int16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s16(r.i16, val);
  PrintVectQ(r, name, 16);
}

inline void PrintReg(const int16x4_t val, const char* name) {
  DebugRegister r;
  vst1_s16(r.i16, val);
  PrintVect(r, name, 16);
}

inline void PrintReg(const int8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s8(r.i8, val);
  PrintVectQ(r, name, 8);
}

inline void PrintReg(const int8x8_t val, const char* name) {
  DebugRegister r;
  vst1_s8(r.i8, val);
  PrintVect(r, name, 8);
}

// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %d\n", name, x);
  }
}

// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %x\n", name, x);
  }
}

// Shorthand macros; the stringized argument doubles as the label.
#define PR(x) PrintReg(x, #x)
#define PD(x) PrintReg(x, #x)
#define PX(x) PrintHex(x, #x)

#if LIBGAV1_MSAN
// Dumps MemorySanitizer's shadow (initialized/uninitialized state) for the
// |size| bytes starting at |r|.
inline void PrintShadow(const void* r, const char* const name,
                        const size_t size) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "Shadow for %s:\n", name);
    __msan_print_shadow(r, size);
  }
}
#define PS(var, N) PrintShadow(var, #var, N)

#endif  // LIBGAV1_MSAN

#endif  // 0
203
204 namespace libgav1 {
205 namespace dsp {
206
207 //------------------------------------------------------------------------------
208 // Load functions.
209
// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
// the values. Use caution when using this in loops because it will re-zero the
// register before loading on every iteration.
inline uint8x8_t Load2(const void* const buf) {
  const uint16x4_t zero = vdup_n_u16(0);
  uint16_t temp;
  // memcpy() tolerates an unaligned |buf|.
  memcpy(&temp, buf, 2);
  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
  uint16_t temp;
  memcpy(&temp, buf, 2);
  return vreinterpret_u8_u16(
      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
}

// Load 2 uint16_t values into lanes |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u16_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
}

// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
// register before loading the values. Use caution when using this in loops
// because it will re-zero the register before loading on every iteration.
inline uint8x8_t Load4(const void* const buf) {
  const uint32x2_t zero = vdup_n_u32(0);
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
}

// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
template <int lane>
inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u8_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
}

// Convenience functions for 16-bit loads from a uint8_t* source.
inline uint16x4_t Load4U16(const void* const buf) {
  return vld1_u16(static_cast<const uint16_t*>(buf));
}

inline uint16x8_t Load8U16(const void* const buf) {
  return vld1q_u16(static_cast<const uint16_t*>(buf));
}
264
//------------------------------------------------------------------------------
// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.

// Zeros the final |over_read_in_bytes| lanes of |source| under MSAN so that
// deliberately over-read (and unused) bytes do not trigger warnings. A pass-
// through in non-MSAN builds.
inline uint8x8_t MaskOverreads(const uint8x8_t source,
                               const ptrdiff_t over_read_in_bytes) {
  uint8x8_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x8_t mask = vdup_n_u8(0);
    uint8x8_t valid_element_mask = vdup_n_u8(-1);
    const int valid_bytes =
        std::min(8, 8 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vext_u8(valid_element_mask, mask, 7);
    }
    dst = vand_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

// 128-bit variant of MaskOverreads().
inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  uint8x16_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x16_t mask = vdupq_n_u8(0);
    uint8x16_t valid_element_mask = vdupq_n_u8(-1);
    const int valid_bytes =
        std::min(16, 16 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vextq_u8(valid_element_mask, mask, 15);
    }
    dst = vandq_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

// uint16x8_t wrapper; |over_read_in_bytes| is still measured in bytes.
inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(
      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
}

// Load 8 bytes, masking any over-read tail bytes under MSAN.
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
                             const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(vld1_u8(source), over_read_in_bytes);
}

// Load 16 bytes, masking any over-read tail bytes under MSAN.
inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
                               const ptrdiff_t over_read_in_bytes) {
  return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
}

// Load 8 uint16_t values, masking any over-read tail bytes under MSAN.
inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
                                const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(MaskOverreadsQ(
      vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
}

// Load 4 uint32_t values, masking any over-read tail bytes under MSAN.
inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
                                const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u32_u8(MaskOverreadsQ(
      vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
}
337
338 //------------------------------------------------------------------------------
339 // Store functions.
340
// Write |val| to |buf| via memcpy so the compiler sees the store through the
// value's own type. Without this the compiler may assume the required
// alignment of the type (4 bytes in the case of uint32_t) and add alignment
// hints to the memory access.
template <typename T>
inline void ValueToMem(void* const buf, T val) {
  memcpy(buf, &val, sizeof(T));
}
348
// Store 4 int8_t values from the low half of an int8x8_t register.
inline void StoreLo4(void* const buf, const int8x8_t val) {
  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
}

// Store 4 uint8_t values from the low half of a uint8x8_t register.
inline void StoreLo4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
}

// Store 4 uint8_t values from the high half of a uint8x8_t register.
inline void StoreHi4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
}

// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint8x8_t val) {
  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x8_t val) {
  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x4_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}

// Store 4 uint16_t values. Simplify code when caller has |buf| cast as
// uint8_t*.
inline void Store4(void* const buf, const uint16x4_t val) {
  vst1_u16(static_cast<uint16_t*>(buf), val);
}

// Store 8 uint16_t values. Simplify code when caller has |buf| cast as
// uint8_t*.
inline void Store8(void* const buf, const uint16x8_t val) {
  vst1q_u16(static_cast<uint16_t*>(buf), val);
}

// Interleaved store of 4 int16x8_t vectors (64 bytes) with an MSAN workaround.
inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
#if LIBGAV1_MSAN
  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
  // bytes of the destination as initialized. To avoid missing truly
  // uninitialized memory, check the input vectors first, before marking the
  // whole 64 bytes initialized. If any input vector contains unused values, it
  // should pass through MaskOverreadsQ first.
  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
  vst4q_s16(static_cast<int16_t*>(buf), src);
  __msan_unpoison(buf, sizeof(int16x8x4_t));
#else
  vst4q_s16(static_cast<int16_t*>(buf), src);
#endif  // LIBGAV1_MSAN
}
412
413 //------------------------------------------------------------------------------
414 // Pointer helpers.
415
// Advances |ptr| by |stride| bytes (not elements) and returns it as the same
// pointer type, using native pointer arithmetic. Works for both const and
// non-const T; the const_cast only restores the qualifier stripped while
// stepping through uint8_t*.
template <typename T>
inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
  const uint8_t* const byte_ptr =
      reinterpret_cast<const uint8_t*>(ptr) + stride;
  return reinterpret_cast<T*>(const_cast<uint8_t*>(byte_ptr));
}
423
//------------------------------------------------------------------------------
// Multiply.
// Shims for A64-only "high"/"laneq" multiply intrinsics: on armv7 each is
// emulated with the matching half-register or D-register intrinsic.

// Shim vmull_high_u16 for armv7.
inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_u16(a, b);
#else
  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
#endif
}

// Shim vmull_high_s16 for armv7.
inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_s16(a, b);
#else
  return vmull_s16(vget_high_s16(a), vget_high_s16(b));
#endif
}

// Shim vmlal_high_u16 for armv7.
inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
                               const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_u16(a, b, c);
#else
  return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
#endif
}

// Shim vmlal_high_s16 for armv7.
inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
                              const int16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_s16(a, b, c);
#else
  return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
#endif
}

// Shim vmul_laneq_u16 for armv7.
template <int lane>
inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmul_laneq_u16(a, b, lane);
#else
  // |lane| is a template parameter, so only one branch survives compilation.
  if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}

// Shim vmulq_laneq_u16 for armv7.
template <int lane>
inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmulq_laneq_u16(a, b, lane);
#else
  if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}

// Shim vmla_laneq_u16 for armv7.
template <int lane>
inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
                               const uint16x8_t c) {
#if defined(__aarch64__)
  return vmla_laneq_u16(a, b, c, lane);
#else
  if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
  return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
#endif
}

// Shim vmlaq_laneq_u16 for armv7.
template <int lane>
inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
                                const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlaq_laneq_u16(a, b, c, lane);
#else
  if (lane < 4) return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
  return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
#endif
}
510
//------------------------------------------------------------------------------
// Bit manipulation.

// Shift the entire 64-bit register left by |shift| bits.
// vshXX_n_XX() requires an immediate.
template <int shift>
inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}

// Shift the entire 64-bit register right by |shift| bits.
template <int shift>
inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
  return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}

// Signed-lane variant; the shift itself is done on the u64 reinterpretation.
template <int shift>
inline int8x8_t RightShiftVector(const int8x8_t vector) {
  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}

// Shim vqtbl1_u8 for armv7.
inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl1_u8(a, index);
#else
  const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
  return vtbl2_u8(b, index);
#endif
}

// Shim vqtbl2_u8 for armv7.
inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl2_u8(a, index);
#else
  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
  return vtbl4_u8(b, index);
#endif
}

// Shim vqtbl2q_u8 for armv7.
inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
#if defined(__aarch64__)
  return vqtbl2q_u8(a, index);
#else
  return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
                     VQTbl2U8(a, vget_high_u8(index)));
#endif
}

// Shim vqtbl3_u8 for armv7.
inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl3_u8(a, index);
#else
  // armv7 table lookups cover at most 32 bytes, so split the 48-byte table:
  // vtbl4 handles indices 0..31, then vtbx2 patches in indices 32..47.
  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
  const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
  const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
  const uint8x8_t partial_lookup = vtbl4_u8(b, index);
  return vtbx2_u8(partial_lookup, c, index_ext);
#endif
}

// Shim vqtbl3q_u8 for armv7.
inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
#if defined(__aarch64__)
  return vqtbl3q_u8(a, index);
#else
  return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
                     VQTbl3U8(a, vget_high_u8(index)));
#endif
}

// Shim vqtbl1_s8 for armv7.
inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl1_s8(a, index);
#else
  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
  return vtbl2_s8(b, vreinterpret_s8_u8(index));
#endif
}
594
//------------------------------------------------------------------------------
// Saturation helpers.

// Clamp each lane of |val| to the inclusive range [|low|, |high|].
inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
                          const int16x4_t high) {
  return vmin_s16(vmax_s16(val, low), high);
}

// 128-bit variant of Clip3S16().
inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
                          const int16x8_t high) {
  return vminq_s16(vmaxq_s16(val, low), high);
}

// Clamp each lane of |val| to [0, (1 << bitdepth) - 1] and return it as
// unsigned pixels.
inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
  const int16x8_t low = vdupq_n_s16(0);
  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);

  // After clamping to >= 0 the reinterpret to unsigned is value-preserving.
  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
}
614
//------------------------------------------------------------------------------
// Interleave.

// Interleave the 8-bit lanes of |a| and |b|, keeping the low result:
// a0 b0 a1 b1 a2 b2 a3 b3.
// vzipN is exclusive to A64.
inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vzip1_u8(a, b);
#else
  // Discard |.val[1]|
  return vzip_u8(a, b).val[0];
#endif
}

// Interleave the low 32-bit halves: a_lo b_lo.
inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  // Discard |.val[1]|
  return vreinterpret_u8_u32(
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
#endif
}

// Signed-lane variant of InterleaveLow32().
inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  // Discard |.val[1]|
  return vreinterpret_s8_u32(
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
#endif
}

// Interleave the high 32-bit halves: a_hi b_hi.
inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  // Discard |.val[0]|
  return vreinterpret_u8_u32(
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
#endif
}

// Signed-lane variant of InterleaveHigh32().
inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  // Discard |.val[0]|
  return vreinterpret_s8_u32(
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
#endif
}
671
//------------------------------------------------------------------------------
// Sum.

// Horizontal sum of all 8 uint8_t lanes, widened to uint16_t.
inline uint16_t SumVector(const uint8x8_t a) {
#if defined(__aarch64__)
  return vaddlv_u8(a);
#else
  // Successive pairwise widening adds: u8 -> u16 -> u32 -> u64.
  const uint16x4_t c = vpaddl_u8(a);
  const uint32x2_t d = vpaddl_u16(c);
  const uint64x1_t e = vpaddl_u32(d);
  return static_cast<uint16_t>(vget_lane_u64(e, 0));
#endif  // defined(__aarch64__)
}

// Horizontal sum of both uint32_t lanes.
inline uint32_t SumVector(const uint32x2_t a) {
#if defined(__aarch64__)
  return vaddv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
#endif  // defined(__aarch64__)
}

// Horizontal sum of all 4 uint32_t lanes.
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
  return static_cast<uint32_t>(vget_lane_u64(c, 0));
#endif
}
704
//------------------------------------------------------------------------------
// Transpose.

// Transpose 32 bit elements such that:
// a: 00 01
// b: 02 03
// returns
// val[0]: 00 02
// val[1]: 01 03
inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
  const uint32x2_t a_32 = vreinterpret_u32_u8(a);
  const uint32x2_t b_32 = vreinterpret_u32_u8(b);
  const uint32x2x2_t c = vtrn_u32(a_32, b_32);
  const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
                         vreinterpret_u8_u32(c.val[1])};
  return d;
}

// Swap high and low 32 bit elements.
inline uint8x8_t Transpose32(const uint8x8_t a) {
  const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
  return vreinterpret_u8_u32(b);
}

// Swap high and low halves.
inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }

// Implement vtrnq_s64().
// Input:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

// Unsigned variant of VtrnqS64().
inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

// In-place transpose of a 4x4 block of uint16_t lanes.
// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}

// 4x4 byte transpose with two rows packed per register.
// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
// Note that columns [1] and [2] are transposed.
// a: 00 10 20 30 02 12 22 32
// b: 01 11 21 31 03 13 23 33
inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
  const uint16x4x2_t c =
      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
  const uint8x8x2_t e =
      vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
  *a = e.val[0];
  *b = e.val[1];
}

// In-place transpose of a 4x8 block of uint16_t lanes into an 8x4 block.
// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
inline void Transpose4x8(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}

// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard Transpose4x8 will produce the same reversals, but with the
// order of the low halves also restored relative to the high halves. This is
// preferable because it puts all values from the same source row back together,
// but some post-processing is inevitable.
inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
911
// Reversible if the x4 values are packed next to each other.
// x4 input / x8 output:
// a0: 00 01 02 03 40 41 42 43
// a1: 10 11 12 13 50 51 52 53
// a2: 20 21 22 23 60 61 62 63
// a3: 30 31 32 33 70 71 72 73
// x8 input / x4 output:
// a0: 00 10 20 30 40 50 60 70
// a1: 01 11 21 31 41 51 61 71
// a2: 02 12 22 32 42 52 62 72
// a3: 03 13 23 33 43 53 63 73
inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
                         uint8x8_t* a3) {
  // Swap 8-bit elements within each row pair.
  const uint8x8x2_t tr01 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t tr23 = vtrn_u8(*a2, *a3);

  // Swap 16-bit pairs to finish the transpose.
  const uint16x4x2_t even = vtrn_u16(vreinterpret_u16_u8(tr01.val[0]),
                                     vreinterpret_u16_u8(tr23.val[0]));
  const uint16x4x2_t odd = vtrn_u16(vreinterpret_u16_u8(tr01.val[1]),
                                    vreinterpret_u16_u8(tr23.val[1]));

  *a0 = vreinterpret_u8_u16(even.val[0]);
  *a1 = vreinterpret_u8_u16(odd.val[0]);
  *a2 = vreinterpret_u8_u16(even.val[1]);
  *a3 = vreinterpret_u8_u16(odd.val[1]);
}
938
// Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// a[4]: 40 41 42 43 44 45 46 47
// a[5]: 50 51 52 53 54 55 56 57
// a[6]: 60 61 62 63 64 65 66 67
// a[7]: 70 71 72 73 74 75 76 77

// Output:
// a[0]: 00 10 20 30 40 50 60 70
// a[1]: 01 11 21 31 41 51 61 71
// a[2]: 02 12 22 32 42 52 62 72
// a[3]: 03 13 23 33 43 53 63 73
// a[4]: 04 14 24 34 44 54 64 74
// a[5]: 05 15 25 35 45 55 65 75
// a[6]: 06 16 26 36 46 56 66 76
// a[7]: 07 17 27 37 47 57 67 77
inline void Transpose8x8(int8x8_t a[8]) {
  // Combine rows into quad registers and swap 8-bit elements, producing:
  // tr8_0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // tr8_0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // tr8_1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // tr8_1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const int8x16x2_t tr8_0 =
      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
  const int8x16x2_t tr8_1 =
      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));

  // Swap 16-bit elements, producing:
  // tr16_0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // tr16_0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // tr16_1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // tr16_1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const int16x8x2_t tr16_0 = vtrnq_s16(vreinterpretq_s16_s8(tr8_0.val[0]),
                                       vreinterpretq_s16_s8(tr8_1.val[0]));
  const int16x8x2_t tr16_1 = vtrnq_s16(vreinterpretq_s16_s8(tr8_0.val[1]),
                                       vreinterpretq_s16_s8(tr8_1.val[1]));

  // Unzip 32-bit elements, producing:
  // uz0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // uz0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // uz1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // uz1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const int32x4x2_t uz0 = vuzpq_s32(vreinterpretq_s32_s16(tr16_0.val[0]),
                                    vreinterpretq_s32_s16(tr16_1.val[0]));
  const int32x4x2_t uz1 = vuzpq_s32(vreinterpretq_s32_s16(tr16_0.val[1]),
                                    vreinterpretq_s32_s16(tr16_1.val[1]));

  // Split each quad register back into two output rows.
  a[0] = vreinterpret_s8_s32(vget_low_s32(uz0.val[0]));
  a[1] = vreinterpret_s8_s32(vget_high_s32(uz0.val[0]));
  a[2] = vreinterpret_s8_s32(vget_low_s32(uz1.val[0]));
  a[3] = vreinterpret_s8_s32(vget_high_s32(uz1.val[0]));
  a[4] = vreinterpret_s8_s32(vget_low_s32(uz0.val[1]));
  a[5] = vreinterpret_s8_s32(vget_high_s32(uz0.val[1]));
  a[6] = vreinterpret_s8_s32(vget_low_s32(uz1.val[1]));
  a[7] = vreinterpret_s8_s32(vget_high_s32(uz1.val[1]));
}
1007
// Unsigned counterpart of the int8x8_t Transpose8x8: same 8x8 byte transpose
// via 8-bit trn, 16-bit trn, then 32-bit uzp.
inline void Transpose8x8(uint8x8_t a[8]) {
  // Pair rows into quad registers and swap 8-bit elements.
  const uint8x16x2_t tr8_0 =
      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
  const uint8x16x2_t tr8_1 =
      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));

  // Swap 16-bit elements.
  const uint16x8x2_t tr16_0 = vtrnq_u16(vreinterpretq_u16_u8(tr8_0.val[0]),
                                        vreinterpretq_u16_u8(tr8_1.val[0]));
  const uint16x8x2_t tr16_1 = vtrnq_u16(vreinterpretq_u16_u8(tr8_0.val[1]),
                                        vreinterpretq_u16_u8(tr8_1.val[1]));

  // Unzip 32-bit elements; each result register now holds two output rows.
  const uint32x4x2_t uz0 = vuzpq_u32(vreinterpretq_u32_u16(tr16_0.val[0]),
                                     vreinterpretq_u32_u16(tr16_1.val[0]));
  const uint32x4x2_t uz1 = vuzpq_u32(vreinterpretq_u32_u16(tr16_0.val[1]),
                                     vreinterpretq_u32_u16(tr16_1.val[1]));

  a[0] = vreinterpret_u8_u32(vget_low_u32(uz0.val[0]));
  a[1] = vreinterpret_u8_u32(vget_high_u32(uz0.val[0]));
  a[2] = vreinterpret_u8_u32(vget_low_u32(uz1.val[0]));
  a[3] = vreinterpret_u8_u32(vget_high_u32(uz1.val[0]));
  a[4] = vreinterpret_u8_u32(vget_low_u32(uz0.val[1]));
  a[5] = vreinterpret_u8_u32(vget_high_u32(uz0.val[1]));
  a[6] = vreinterpret_u8_u32(vget_low_u32(uz1.val[1]));
  a[7] = vreinterpret_u8_u32(vget_high_u32(uz1.val[1]));
}
1034
// 8x8 byte transpose that leaves each pair of output rows packed in one
// uint8x16_t instead of splitting back to uint8x8_t registers.
inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
  // Pair rows into quad registers and swap 8-bit elements.
  const uint8x16x2_t tr8_0 =
      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
  const uint8x16x2_t tr8_1 =
      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));

  // Swap 16-bit elements.
  const uint16x8x2_t tr16_0 = vtrnq_u16(vreinterpretq_u16_u8(tr8_0.val[0]),
                                        vreinterpretq_u16_u8(tr8_1.val[0]));
  const uint16x8x2_t tr16_1 = vtrnq_u16(vreinterpretq_u16_u8(tr8_0.val[1]),
                                        vreinterpretq_u16_u8(tr8_1.val[1]));

  // Unzip 32-bit elements; each register holds two transposed rows.
  const uint32x4x2_t uz0 = vuzpq_u32(vreinterpretq_u32_u16(tr16_0.val[0]),
                                     vreinterpretq_u32_u16(tr16_1.val[0]));
  const uint32x4x2_t uz1 = vuzpq_u32(vreinterpretq_u32_u16(tr16_0.val[1]),
                                     vreinterpretq_u32_u16(tr16_1.val[1]));

  out[0] = vreinterpretq_u8_u32(uz0.val[0]);
  out[1] = vreinterpretq_u8_u32(uz1.val[0]);
  out[2] = vreinterpretq_u8_u32(uz0.val[1]);
  out[3] = vreinterpretq_u8_u32(uz1.val[1]);
}
1056
// Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// a[4]: 40 41 42 43 44 45 46 47
// a[5]: 50 51 52 53 54 55 56 57
// a[6]: 60 61 62 63 64 65 66 67
// a[7]: 70 71 72 73 74 75 76 77

// Output:
// a[0]: 00 10 20 30 40 50 60 70
// a[1]: 01 11 21 31 41 51 61 71
// a[2]: 02 12 22 32 42 52 62 72
// a[3]: 03 13 23 33 43 53 63 73
// a[4]: 04 14 24 34 44 54 64 74
// a[5]: 05 15 25 35 45 55 65 75
// a[6]: 06 16 26 36 46 56 66 76
// a[7]: 07 17 27 37 47 57 67 77
inline void Transpose8x8(int16x8_t a[8]) {
  // Swap 16-bit elements in adjacent row pairs.
  const int16x8x2_t tr16_01 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t tr16_23 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t tr16_45 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t tr16_67 = vtrnq_s16(a[6], a[7]);

  // Swap 32-bit elements across the row-pair results.
  const int32x4x2_t tr32_0 = vtrnq_s32(vreinterpretq_s32_s16(tr16_01.val[0]),
                                       vreinterpretq_s32_s16(tr16_23.val[0]));
  const int32x4x2_t tr32_1 = vtrnq_s32(vreinterpretq_s32_s16(tr16_01.val[1]),
                                       vreinterpretq_s32_s16(tr16_23.val[1]));
  const int32x4x2_t tr32_2 = vtrnq_s32(vreinterpretq_s32_s16(tr16_45.val[0]),
                                       vreinterpretq_s32_s16(tr16_67.val[0]));
  const int32x4x2_t tr32_3 = vtrnq_s32(vreinterpretq_s32_s16(tr16_45.val[1]),
                                       vreinterpretq_s32_s16(tr16_67.val[1]));

  // Swap 64-bit halves to complete the transpose.
  const int16x8x2_t tr64_0 = VtrnqS64(tr32_0.val[0], tr32_2.val[0]);
  const int16x8x2_t tr64_1 = VtrnqS64(tr32_1.val[0], tr32_3.val[0]);
  const int16x8x2_t tr64_2 = VtrnqS64(tr32_0.val[1], tr32_2.val[1]);
  const int16x8x2_t tr64_3 = VtrnqS64(tr32_1.val[1], tr32_3.val[1]);

  a[0] = tr64_0.val[0];
  a[1] = tr64_1.val[0];
  a[2] = tr64_2.val[0];
  a[3] = tr64_3.val[0];
  a[4] = tr64_0.val[1];
  a[5] = tr64_1.val[1];
  a[6] = tr64_2.val[1];
  a[7] = tr64_3.val[1];
}
1105
// Unsigned counterpart of the int16x8_t Transpose8x8: same 16-bit, 32-bit,
// then 64-bit transpose stages.
inline void Transpose8x8(uint16x8_t a[8]) {
  // Swap 16-bit elements in adjacent row pairs.
  const uint16x8x2_t tr16_01 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t tr16_23 = vtrnq_u16(a[2], a[3]);
  const uint16x8x2_t tr16_45 = vtrnq_u16(a[4], a[5]);
  const uint16x8x2_t tr16_67 = vtrnq_u16(a[6], a[7]);

  // Swap 32-bit elements across the row-pair results.
  const uint32x4x2_t tr32_0 =
      vtrnq_u32(vreinterpretq_u32_u16(tr16_01.val[0]),
                vreinterpretq_u32_u16(tr16_23.val[0]));
  const uint32x4x2_t tr32_1 =
      vtrnq_u32(vreinterpretq_u32_u16(tr16_01.val[1]),
                vreinterpretq_u32_u16(tr16_23.val[1]));
  const uint32x4x2_t tr32_2 =
      vtrnq_u32(vreinterpretq_u32_u16(tr16_45.val[0]),
                vreinterpretq_u32_u16(tr16_67.val[0]));
  const uint32x4x2_t tr32_3 =
      vtrnq_u32(vreinterpretq_u32_u16(tr16_45.val[1]),
                vreinterpretq_u32_u16(tr16_67.val[1]));

  // Swap 64-bit halves to complete the transpose.
  const uint16x8x2_t tr64_0 = VtrnqU64(tr32_0.val[0], tr32_2.val[0]);
  const uint16x8x2_t tr64_1 = VtrnqU64(tr32_1.val[0], tr32_3.val[0]);
  const uint16x8x2_t tr64_2 = VtrnqU64(tr32_0.val[1], tr32_2.val[1]);
  const uint16x8x2_t tr64_3 = VtrnqU64(tr32_1.val[1], tr32_3.val[1]);

  a[0] = tr64_0.val[0];
  a[1] = tr64_1.val[0];
  a[2] = tr64_2.val[0];
  a[3] = tr64_3.val[0];
  a[4] = tr64_0.val[1];
  a[5] = tr64_1.val[1];
  a[6] = tr64_2.val[1];
  a[7] = tr64_3.val[1];
}
1136
// Input:
// a[0]: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
// a[1]: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
// a[2]: 20 21 22 23 24 25 26 27  a0 a1 a2 a3 a4 a5 a6 a7
// a[3]: 30 31 32 33 34 35 36 37  b0 b1 b2 b3 b4 b5 b6 b7
// a[4]: 40 41 42 43 44 45 46 47  c0 c1 c2 c3 c4 c5 c6 c7
// a[5]: 50 51 52 53 54 55 56 57  d0 d1 d2 d3 d4 d5 d6 d7
// a[6]: 60 61 62 63 64 65 66 67  e0 e1 e2 e3 e4 e5 e6 e7
// a[7]: 70 71 72 73 74 75 76 77  f0 f1 f2 f3 f4 f5 f6 f7

// Output:
// a[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
// a[1]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
// a[2]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
// a[3]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
// a[4]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
// a[5]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
// a[6]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
// a[7]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
inline void Transpose8x16(uint8x16_t a[8]) {
  // Swap 8-bit elements, producing:
  // tr8_01.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
  // tr8_01.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
  // tr8_23.val[0]: 20 30 22 32 24 34 26 36  a0 b0 a2 b2 a4 b4 a6 b6
  // tr8_23.val[1]: 21 31 23 33 25 35 27 37  a1 b1 a3 b3 a5 b5 a7 b7
  // tr8_45.val[0]: 40 50 42 52 44 54 46 56  c0 d0 c2 d2 c4 d4 c6 d6
  // tr8_45.val[1]: 41 51 43 53 45 55 47 57  c1 d1 c3 d3 c5 d5 c7 d7
  // tr8_67.val[0]: 60 70 62 72 64 74 66 76  e0 f0 e2 f2 e4 f4 e6 f6
  // tr8_67.val[1]: 61 71 63 73 65 75 67 77  e1 f1 e3 f3 e5 f5 e7 f7
  const uint8x16x2_t tr8_01 = vtrnq_u8(a[0], a[1]);
  const uint8x16x2_t tr8_23 = vtrnq_u8(a[2], a[3]);
  const uint8x16x2_t tr8_45 = vtrnq_u8(a[4], a[5]);
  const uint8x16x2_t tr8_67 = vtrnq_u8(a[6], a[7]);

  // Swap 16-bit elements, producing:
  // tr16_0.val[0]: 00 10 20 30 04 14 24 34  80 90 a0 b0 84 94 a4 b4
  // tr16_0.val[1]: 02 12 22 32 06 16 26 36  82 92 a2 b2 86 96 a6 b6
  // tr16_1.val[0]: 01 11 21 31 05 15 25 35  81 91 a1 b1 85 95 a5 b5
  // tr16_1.val[1]: 03 13 23 33 07 17 27 37  83 93 a3 b3 87 97 a7 b7
  // tr16_2.val[0]: 40 50 60 70 44 54 64 74  c0 d0 e0 f0 c4 d4 e4 f4
  // tr16_2.val[1]: 42 52 62 72 46 56 66 76  c2 d2 e2 f2 c6 d6 e6 f6
  // tr16_3.val[0]: 41 51 61 71 45 55 65 75  c1 d1 e1 f1 c5 d5 e5 f5
  // tr16_3.val[1]: 43 53 63 73 47 57 67 77  c3 d3 e3 f3 c7 d7 e7 f7
  const uint16x8x2_t tr16_0 = vtrnq_u16(vreinterpretq_u16_u8(tr8_01.val[0]),
                                        vreinterpretq_u16_u8(tr8_23.val[0]));
  const uint16x8x2_t tr16_1 = vtrnq_u16(vreinterpretq_u16_u8(tr8_01.val[1]),
                                        vreinterpretq_u16_u8(tr8_23.val[1]));
  const uint16x8x2_t tr16_2 = vtrnq_u16(vreinterpretq_u16_u8(tr8_45.val[0]),
                                        vreinterpretq_u16_u8(tr8_67.val[0]));
  const uint16x8x2_t tr16_3 = vtrnq_u16(vreinterpretq_u16_u8(tr8_45.val[1]),
                                        vreinterpretq_u16_u8(tr8_67.val[1]));

  // Swap 32-bit elements, producing:
  // tr32_0.val[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
  // tr32_0.val[1]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
  // tr32_1.val[0]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
  // tr32_1.val[1]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
  // tr32_2.val[0]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
  // tr32_2.val[1]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
  // tr32_3.val[0]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
  // tr32_3.val[1]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
  const uint32x4x2_t tr32_0 = vtrnq_u32(vreinterpretq_u32_u16(tr16_0.val[0]),
                                        vreinterpretq_u32_u16(tr16_2.val[0]));
  const uint32x4x2_t tr32_1 = vtrnq_u32(vreinterpretq_u32_u16(tr16_1.val[0]),
                                        vreinterpretq_u32_u16(tr16_3.val[0]));
  const uint32x4x2_t tr32_2 = vtrnq_u32(vreinterpretq_u32_u16(tr16_0.val[1]),
                                        vreinterpretq_u32_u16(tr16_2.val[1]));
  const uint32x4x2_t tr32_3 = vtrnq_u32(vreinterpretq_u32_u16(tr16_1.val[1]),
                                        vreinterpretq_u32_u16(tr16_3.val[1]));

  a[0] = vreinterpretq_u8_u32(tr32_0.val[0]);
  a[1] = vreinterpretq_u8_u32(tr32_1.val[0]);
  a[2] = vreinterpretq_u8_u32(tr32_2.val[0]);
  a[3] = vreinterpretq_u8_u32(tr32_3.val[0]);
  a[4] = vreinterpretq_u8_u32(tr32_0.val[1]);
  a[5] = vreinterpretq_u8_u32(tr32_1.val[1]);
  a[6] = vreinterpretq_u8_u32(tr32_2.val[1]);
  a[7] = vreinterpretq_u8_u32(tr32_3.val[1]);
}
1213
// Zero-extends each unsigned 8-bit lane to a 16-bit lane and returns the
// result as signed 16-bit lanes (values 0..255 always fit).
inline int16x8_t ZeroExtend(const uint8x8_t in) {
  const uint16x8_t widened = vmovl_u8(in);
  return vreinterpretq_s16_u16(widened);
}
1217
1218 } // namespace dsp
1219 } // namespace libgav1
1220
1221 #endif // LIBGAV1_ENABLE_NEON
1222 #endif // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
1223