• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2019 The libgav1 Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
18 #define LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
19 
20 #include "src/utils/cpu.h"
21 
22 #if LIBGAV1_ENABLE_NEON
23 
24 #include <arm_neon.h>
25 
26 #include <algorithm>
27 #include <cstddef>
28 #include <cstdint>
29 #include <cstring>
30 
31 #include "src/utils/compiler_attributes.h"
32 
33 #if 0
34 #include <cstdio>
35 #include <string>
36 
37 constexpr bool kEnablePrintRegs = true;
38 
// Overlay for a 64-bit (D) register: lets a stored vector be inspected as any
// of its lane interpretations.
union DebugRegister {
  int8_t i8[8];
  int16_t i16[4];
  int32_t i32[2];
  uint8_t u8[8];
  uint16_t u16[4];
  uint32_t u32[2];
};

// Overlay for a 128-bit (Q) register.
union DebugRegisterQ {
  int8_t i8[16];
  int16_t i16[8];
  int32_t i32[4];
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
};

// Quite useful macro for debugging. Left here for convenience.
// Prints the 8 bytes of |r| to stderr as hex, grouped into |size|-bit lanes
// (size must be 8, 16 or 32; other values print only the label).
inline void PrintVect(const DebugRegister r, const char* const name, int size) {
  int n;
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s\t: ", name);
    if (size == 8) {
      for (n = 0; n < 8; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
    } else if (size == 16) {
      for (n = 0; n < 4; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
    } else if (size == 32) {
      for (n = 0; n < 2; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
    }
    fprintf(stderr, "\n");
  }
}

// Debugging macro for 128-bit types.
// Prints the 16 bytes of |r| to stderr as hex, grouped into |size|-bit lanes.
inline void PrintVectQ(const DebugRegisterQ r, const char* const name,
                       int size) {
  int n;
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s\t: ", name);
    if (size == 8) {
      for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", r.u8[n]);
    } else if (size == 16) {
      for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", r.u16[n]);
    } else if (size == 32) {
      for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", r.u32[n]);
    }
    fprintf(stderr, "\n");
  }
}

// PrintReg overloads: store a NEON register to a DebugRegister(Q) overlay and
// dump it with PrintVect(Q). One overload per vector type. The x2 overload
// prints both halves of the pair with ".val[0]"/".val[1]" suffixed labels.
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val.val[0]);
  const std::string name0 = name + std::string(".val[0]");
  PrintVectQ(r, name0.c_str(), 32);
  vst1q_s32(r.i32, val.val[1]);
  const std::string name1 = name + std::string(".val[1]");
  PrintVectQ(r, name1.c_str(), 32);
}

inline void PrintReg(const uint32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u32(r.u32, val);
  PrintVectQ(r, name, 32);
}

inline void PrintReg(const uint32x2_t val, const char* name) {
  DebugRegister r;
  vst1_u32(r.u32, val);
  PrintVect(r, name, 32);
}

inline void PrintReg(const uint16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u16(r.u16, val);
  PrintVectQ(r, name, 16);
}

inline void PrintReg(const uint16x4_t val, const char* name) {
  DebugRegister r;
  vst1_u16(r.u16, val);
  PrintVect(r, name, 16);
}

inline void PrintReg(const uint8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_u8(r.u8, val);
  PrintVectQ(r, name, 8);
}

inline void PrintReg(const uint8x8_t val, const char* name) {
  DebugRegister r;
  vst1_u8(r.u8, val);
  PrintVect(r, name, 8);
}

inline void PrintReg(const int32x4_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s32(r.i32, val);
  PrintVectQ(r, name, 32);
}

inline void PrintReg(const int32x2_t val, const char* name) {
  DebugRegister r;
  vst1_s32(r.i32, val);
  PrintVect(r, name, 32);
}

inline void PrintReg(const int16x8_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s16(r.i16, val);
  PrintVectQ(r, name, 16);
}

inline void PrintReg(const int16x4_t val, const char* name) {
  DebugRegister r;
  vst1_s16(r.i16, val);
  PrintVect(r, name, 16);
}

inline void PrintReg(const int8x16_t val, const char* name) {
  DebugRegisterQ r;
  vst1q_s8(r.i8, val);
  PrintVectQ(r, name, 8);
}

inline void PrintReg(const int8x8_t val, const char* name) {
  DebugRegister r;
  vst1_s8(r.i8, val);
  PrintVect(r, name, 8);
}

// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %d\n", name, x);
  }
}

// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "%s: %x\n", name, x);
  }
}

// Shorthand: print a register (PR), a decimal value (PD) or a hex value (PX),
// using the stringized expression itself as the label.
#define PR(x) PrintReg(x, #x)
#define PD(x) PrintReg(x, #x)
#define PX(x) PrintHex(x, #x)

#if LIBGAV1_MSAN
#include <sanitizer/msan_interface.h>

// Dumps MemorySanitizer's shadow (initialization state) for the |size| bytes
// starting at |r|.
inline void PrintShadow(const void* r, const char* const name,
                        const size_t size) {
  if (kEnablePrintRegs) {
    fprintf(stderr, "Shadow for %s:\n", name);
    __msan_print_shadow(r, size);
  }
}
#define PS(var, N) PrintShadow(var, #var, N)

#endif  // LIBGAV1_MSAN
203 
204 #endif  // 0
205 
206 namespace libgav1 {
207 namespace dsp {
208 
209 //------------------------------------------------------------------------------
210 // Load functions.
211 
// Load 2 uint8_t values into lanes 0 and 1. Zeros the register before loading
// the values. Use caution when using this in loops because it will re-zero the
// register before loading on every iteration.
inline uint8x8_t Load2(const void* const buf) {
  const uint16x4_t zero = vdup_n_u16(0);
  uint16_t temp;
  // memcpy into a local avoids alignment and strict-aliasing problems with an
  // arbitrarily aligned |buf|.
  memcpy(&temp, buf, 2);
  return vreinterpret_u8_u16(vld1_lane_u16(&temp, zero, 0));
}

// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline uint8x8_t Load2(const void* const buf, uint8x8_t val) {
  uint16_t temp;
  memcpy(&temp, buf, 2);
  return vreinterpret_u8_u16(
      vld1_lane_u16(&temp, vreinterpret_u16_u8(val), lane));
}

// Load 2 uint16_t values into lanes |lane| * 2 and |lane| * 2 + 1.
template <int lane>
inline uint16x4_t Load2(const void* const buf, uint16x4_t val) {
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u16_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u16(val), lane));
}

// Load 4 uint8_t values into the low half of a uint8x8_t register. Zeros the
// register before loading the values. Use caution when using this in loops
// because it will re-zero the register before loading on every iteration.
inline uint8x8_t Load4(const void* const buf) {
  const uint32x2_t zero = vdup_n_u32(0);
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u8_u32(vld1_lane_u32(&temp, zero, 0));
}

// Load 4 uint8_t values into 4 lanes starting with |lane| * 4.
template <int lane>
inline uint8x8_t Load4(const void* const buf, uint8x8_t val) {
  uint32_t temp;
  memcpy(&temp, buf, 4);
  return vreinterpret_u8_u32(
      vld1_lane_u32(&temp, vreinterpret_u32_u8(val), lane));
}

// Convenience functions for 16-bit loads from a uint8_t* source.
inline uint16x4_t Load4U16(const void* const buf) {
  return vld1_u16(static_cast<const uint16_t*>(buf));
}

inline uint16x8_t Load8U16(const void* const buf) {
  return vld1q_u16(static_cast<const uint16_t*>(buf));
}
266 
267 //------------------------------------------------------------------------------
268 // Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
269 
// Zeroes the trailing |over_read_in_bytes| bytes of |source| so that
// MemorySanitizer does not report uses of lanes that were (deliberately) read
// past the end of a buffer. Outside MSan builds this is a no-op that returns
// |source| unchanged.
inline uint8x8_t MaskOverreads(const uint8x8_t source,
                               const ptrdiff_t over_read_in_bytes) {
  uint8x8_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x8_t mask = vdup_n_u8(0);
    uint8x8_t valid_element_mask = vdup_n_u8(-1);
    const int valid_bytes =
        std::min(8, 8 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vext_u8(valid_element_mask, mask, 7);
    }
    dst = vand_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

// 128-bit variant of MaskOverreads().
inline uint8x16_t MaskOverreadsQ(const uint8x16_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  uint8x16_t dst = source;
#if LIBGAV1_MSAN
  if (over_read_in_bytes > 0) {
    uint8x16_t mask = vdupq_n_u8(0);
    uint8x16_t valid_element_mask = vdupq_n_u8(-1);
    const int valid_bytes =
        std::min(16, 16 - static_cast<int>(over_read_in_bytes));
    for (int i = 0; i < valid_bytes; ++i) {
      // Feed ff bytes into |mask| one at a time.
      mask = vextq_u8(valid_element_mask, mask, 15);
    }
    dst = vandq_u8(dst, mask);
  }
#else
  static_cast<void>(over_read_in_bytes);
#endif
  return dst;
}

// uint16x8_t wrapper; masking is still done at byte granularity.
inline uint16x8_t MaskOverreadsQ(const uint16x8_t source,
                                 const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(
      MaskOverreadsQ(vreinterpretq_u8_u16(source), over_read_in_bytes));
}

// Load helpers that immediately mask the over-read tail of the vector.
inline uint8x8_t Load1MsanU8(const uint8_t* const source,
                             const ptrdiff_t over_read_in_bytes) {
  return MaskOverreads(vld1_u8(source), over_read_in_bytes);
}

inline uint8x16_t Load1QMsanU8(const uint8_t* const source,
                               const ptrdiff_t over_read_in_bytes) {
  return MaskOverreadsQ(vld1q_u8(source), over_read_in_bytes);
}

inline uint16x8_t Load1QMsanU16(const uint16_t* const source,
                                const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u16_u8(MaskOverreadsQ(
      vreinterpretq_u8_u16(vld1q_u16(source)), over_read_in_bytes));
}

inline uint32x4_t Load1QMsanU32(const uint32_t* const source,
                                const ptrdiff_t over_read_in_bytes) {
  return vreinterpretq_u32_u8(MaskOverreadsQ(
      vreinterpretq_u8_u32(vld1q_u32(source)), over_read_in_bytes));
}
339 
340 //------------------------------------------------------------------------------
341 // Store functions.
342 
// Propagate type information to the compiler. Without this the compiler may
// assume the required alignment of the type (4 bytes in the case of uint32_t)
// and add alignment hints to the memory access.
template <typename T>
inline void ValueToMem(void* const buf, T val) {
  memcpy(buf, &val, sizeof(val));
}

// Store 4 int8_t values from the low half of an int8x8_t register.
inline void StoreLo4(void* const buf, const int8x8_t val) {
  ValueToMem<int32_t>(buf, vget_lane_s32(vreinterpret_s32_s8(val), 0));
}

// Store 4 uint8_t values from the low half of a uint8x8_t register.
inline void StoreLo4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 0));
}

// Store 4 uint8_t values from the high half of a uint8x8_t register.
inline void StoreHi4(void* const buf, const uint8x8_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u8(val), 1));
}

// Store 2 uint8_t values from |lane| * 2 and |lane| * 2 + 1 of a uint8x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint8x8_t val) {
  ValueToMem<uint16_t>(buf, vget_lane_u16(vreinterpret_u16_u8(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x8_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x8_t val) {
  ValueToMem<uint32_t>(buf, vgetq_lane_u32(vreinterpretq_u32_u16(val), lane));
}

// Store 2 uint16_t values from |lane| * 2 and |lane| * 2 + 1 of a uint16x4_t
// register.
template <int lane>
inline void Store2(void* const buf, const uint16x4_t val) {
  ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}

// Simplify code when caller has |buf| cast as uint8_t*.
inline void Store4(void* const buf, const uint16x4_t val) {
  vst1_u16(static_cast<uint16_t*>(buf), val);
}

// Simplify code when caller has |buf| cast as uint8_t*.
inline void Store8(void* const buf, const uint16x8_t val) {
  vst1q_u16(static_cast<uint16_t*>(buf), val);
}

// Interleaved 4-register store with an MSan workaround for the incomplete
// shadow tracking of vst4q_s16.
inline void Store4QMsanS16(void* const buf, const int16x8x4_t src) {
#if LIBGAV1_MSAN
  // The memory shadow is incorrect for vst4q_u16, only marking the first 16
  // bytes of the destination as initialized. To avoid missing truly
  // uninitialized memory, check the input vectors first, before marking the
  // whole 64 bytes initialized. If any input vector contains unused values, it
  // should pass through MaskOverreadsQ first.
  __msan_check_mem_is_initialized(&src.val[0], sizeof(src.val[0]));
  __msan_check_mem_is_initialized(&src.val[1], sizeof(src.val[1]));
  __msan_check_mem_is_initialized(&src.val[2], sizeof(src.val[2]));
  __msan_check_mem_is_initialized(&src.val[3], sizeof(src.val[3]));
  vst4q_s16(static_cast<int16_t*>(buf), src);
  __msan_unpoison(buf, sizeof(int16x8x4_t));
#else
  vst4q_s16(static_cast<int16_t*>(buf), src);
#endif  // LIBGAV1_MSAN
}
414 
415 //------------------------------------------------------------------------------
416 // Pointer helpers.
417 
// This function adds |stride|, given as a number of bytes, to a pointer to a
// larger type, using native pointer arithmetic.
// The const_cast round-trip keeps a single implementation working for both
// const- and non-const-qualified T.
template <typename T>
inline T* AddByteStride(T* ptr, const ptrdiff_t stride) {
  return reinterpret_cast<T*>(
      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(ptr) + stride));
}
425 
426 //------------------------------------------------------------------------------
427 // Multiply.
428 
// Shim vmull_high_u16 for armv7.
// Widening multiply of the high halves of |a| and |b|.
inline uint32x4_t VMullHighU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_u16(a, b);
#else
  return vmull_u16(vget_high_u16(a), vget_high_u16(b));
#endif
}

// Shim vmull_high_s16 for armv7.
inline int32x4_t VMullHighS16(const int16x8_t a, const int16x8_t b) {
#if defined(__aarch64__)
  return vmull_high_s16(a, b);
#else
  return vmull_s16(vget_high_s16(a), vget_high_s16(b));
#endif
}

// Shim vmlal_high_u16 for armv7.
// Widening multiply-accumulate: |a| + high(|b|) * high(|c|).
inline uint32x4_t VMlalHighU16(const uint32x4_t a, const uint16x8_t b,
                               const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_u16(a, b, c);
#else
  return vmlal_u16(a, vget_high_u16(b), vget_high_u16(c));
#endif
}

// Shim vmlal_high_s16 for armv7.
inline int32x4_t VMlalHighS16(const int32x4_t a, const int16x8_t b,
                              const int16x8_t c) {
#if defined(__aarch64__)
  return vmlal_high_s16(a, b, c);
#else
  return vmlal_s16(a, vget_high_s16(b), vget_high_s16(c));
#endif
}

// Shim vmul_laneq_u16 for armv7.
// armv7 has no laneq (Q-register lane) forms, so select the matching half of
// |b| and use the D-register lane variant with the lane index rebased.
template <int lane>
inline uint16x4_t VMulLaneQU16(const uint16x4_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmul_laneq_u16(a, b, lane);
#else
  if (lane < 4) return vmul_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmul_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}

// Shim vmulq_laneq_u16 for armv7.
template <int lane>
inline uint16x8_t VMulQLaneQU16(const uint16x8_t a, const uint16x8_t b) {
#if defined(__aarch64__)
  return vmulq_laneq_u16(a, b, lane);
#else
  if (lane < 4) return vmulq_lane_u16(a, vget_low_u16(b), lane & 0x3);
  return vmulq_lane_u16(a, vget_high_u16(b), (lane - 4) & 0x3);
#endif
}

// Shim vmla_laneq_u16 for armv7.
template <int lane>
inline uint16x4_t VMlaLaneQU16(const uint16x4_t a, const uint16x4_t b,
                               const uint16x8_t c) {
#if defined(__aarch64__)
  return vmla_laneq_u16(a, b, c, lane);
#else
  if (lane < 4) return vmla_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
  return vmla_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
#endif
}

// Shim vmlaq_laneq_u16 for armv7.
template <int lane>
inline uint16x8_t VMlaQLaneQU16(const uint16x8_t a, const uint16x8_t b,
                                const uint16x8_t c) {
#if defined(__aarch64__)
  return vmlaq_laneq_u16(a, b, c, lane);
#else
  if (lane < 4) return vmlaq_lane_u16(a, b, vget_low_u16(c), lane & 0x3);
  return vmlaq_lane_u16(a, b, vget_high_u16(c), (lane - 4) & 0x3);
#endif
}
512 
513 //------------------------------------------------------------------------------
514 // Bit manipulation.
515 
// vshXX_n_XX() requires an immediate.
// Shift the whole 64-bit vector left by |shift| bits (bytes move toward the
// high lanes).
template <int shift>
inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}

// Shift the whole 64-bit vector right by |shift| bits.
template <int shift>
inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
  return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}

template <int shift>
inline int8x8_t RightShiftVector(const int8x8_t vector) {
  return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}

// Shim vqtbl1_u8 for armv7.
// 16-byte table lookup; armv7 vtbl2 takes the table as two D registers.
inline uint8x8_t VQTbl1U8(const uint8x16_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl1_u8(a, index);
#else
  const uint8x8x2_t b = {vget_low_u8(a), vget_high_u8(a)};
  return vtbl2_u8(b, index);
#endif
}

// Shim vqtbl2_u8 for armv7.
// 32-byte table lookup via vtbl4 on four D registers.
inline uint8x8_t VQTbl2U8(const uint8x16x2_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl2_u8(a, index);
#else
  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
  return vtbl4_u8(b, index);
#endif
}

// Shim vqtbl2q_u8 for armv7.
inline uint8x16_t VQTbl2QU8(const uint8x16x2_t a, const uint8x16_t index) {
#if defined(__aarch64__)
  return vqtbl2q_u8(a, index);
#else
  return vcombine_u8(VQTbl2U8(a, vget_low_u8(index)),
                     VQTbl2U8(a, vget_high_u8(index)));
#endif
}

// Shim vqtbl3_u8 for armv7.
// 48-byte table lookup. armv7 tables max out at 32 bytes, so indices 0-31 go
// through vtbl4 on |b| and indices 32-47 through vtbx2 on |c| with the index
// rebased by 32. vtbx leaves lanes with out-of-range indices untouched, so the
// two passes merge correctly.
inline uint8x8_t VQTbl3U8(const uint8x16x3_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl3_u8(a, index);
#else
  const uint8x8x4_t b = {vget_low_u8(a.val[0]), vget_high_u8(a.val[0]),
                         vget_low_u8(a.val[1]), vget_high_u8(a.val[1])};
  const uint8x8x2_t c = {vget_low_u8(a.val[2]), vget_high_u8(a.val[2])};
  const uint8x8_t index_ext = vsub_u8(index, vdup_n_u8(32));
  const uint8x8_t partial_lookup = vtbl4_u8(b, index);
  return vtbx2_u8(partial_lookup, c, index_ext);
#endif
}

// Shim vqtbl3q_u8 for armv7.
inline uint8x16_t VQTbl3QU8(const uint8x16x3_t a, const uint8x16_t index) {
#if defined(__aarch64__)
  return vqtbl3q_u8(a, index);
#else
  return vcombine_u8(VQTbl3U8(a, vget_low_u8(index)),
                     VQTbl3U8(a, vget_high_u8(index)));
#endif
}

// Shim vqtbl1_s8 for armv7.
inline int8x8_t VQTbl1S8(const int8x16_t a, const uint8x8_t index) {
#if defined(__aarch64__)
  return vqtbl1_s8(a, index);
#else
  const int8x8x2_t b = {vget_low_s8(a), vget_high_s8(a)};
  return vtbl2_s8(b, vreinterpret_s8_u8(index));
#endif
}
596 
597 //------------------------------------------------------------------------------
598 // Saturation helpers.
599 
// Lanewise clamp of |val| to the range [|low|, |high|].
inline int16x4_t Clip3S16(const int16x4_t val, const int16x4_t low,
                          const int16x4_t high) {
  return vmin_s16(vmax_s16(val, low), high);
}

inline int16x8_t Clip3S16(const int16x8_t val, const int16x8_t low,
                          const int16x8_t high) {
  return vminq_s16(vmaxq_s16(val, low), high);
}

// Lanewise clamp of |val| to the pixel range [0, (1 << bitdepth) - 1],
// returned as unsigned. The max-with-0 happens in the signed domain, after
// which the reinterpret to unsigned is value-preserving.
inline uint16x8_t ConvertToUnsignedPixelU16(const int16x8_t val, int bitdepth) {
  const int16x8_t low = vdupq_n_s16(0);
  const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);

  return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
}
616 
617 //------------------------------------------------------------------------------
618 // Interleave.
619 
// vzipN is exclusive to A64.
// Interleave the 8-bit lanes of |a| and |b|, keeping the low result:
// a0 b0 a1 b1 a2 b2 a3 b3.
inline uint8x8_t InterleaveLow8(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vzip1_u8(a, b);
#else
  // Discard |.val[1]|
  return vzip_u8(a, b).val[0];
#endif
}

// Interleave at 32-bit granularity, keeping the low 32 bits of each input:
// a0 a1 a2 a3 b0 b1 b2 b3.
inline uint8x8_t InterleaveLow32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip1_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  // Discard |.val[1]|
  return vreinterpret_u8_u32(
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[0]);
#endif
}

inline int8x8_t InterleaveLow32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip1_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  // Discard |.val[1]|
  return vreinterpret_s8_u32(
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[0]);
#endif
}

// Interleave at 32-bit granularity, keeping the high 32 bits of each input.
inline uint8x8_t InterleaveHigh32(const uint8x8_t a, const uint8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_u8_u32(
      vzip2_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)));
#else
  // Discard |.val[0]|
  return vreinterpret_u8_u32(
      vzip_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(b)).val[1]);
#endif
}

inline int8x8_t InterleaveHigh32(const int8x8_t a, const int8x8_t b) {
#if defined(__aarch64__)
  return vreinterpret_s8_u32(
      vzip2_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)));
#else
  // Discard |.val[0]|
  return vreinterpret_s8_u32(
      vzip_u32(vreinterpret_u32_s8(a), vreinterpret_u32_s8(b)).val[1]);
#endif
}
673 
674 //------------------------------------------------------------------------------
675 // Sum.
676 
// Horizontal sum of all 8 uint8_t lanes. The maximum possible sum is
// 8 * 255 = 2040, which fits in uint16_t. The armv7 path widens pairwise:
// u8 -> u16 -> u32 -> u64.
inline uint16_t SumVector(const uint8x8_t a) {
#if defined(__aarch64__)
  return vaddlv_u8(a);
#else
  const uint16x4_t c = vpaddl_u8(a);
  const uint32x2_t d = vpaddl_u16(c);
  const uint64x1_t e = vpaddl_u32(d);
  return static_cast<uint16_t>(vget_lane_u64(e, 0));
#endif  // defined(__aarch64__)
}

// Horizontal sum of both uint32_t lanes (modulo 2^32).
inline uint32_t SumVector(const uint32x2_t a) {
#if defined(__aarch64__)
  return vaddv_u32(a);
#else
  const uint64x1_t b = vpaddl_u32(a);
  return vget_lane_u32(vreinterpret_u32_u64(b), 0);
#endif  // defined(__aarch64__)
}

// Horizontal sum of all 4 uint32_t lanes (modulo 2^32).
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
  return static_cast<uint32_t>(vget_lane_u64(c, 0));
#endif
}
706 
707 //------------------------------------------------------------------------------
708 // Transpose.
709 
// Transpose 32 bit elements such that:
// a: 00 01
// b: 02 03
// returns
// val[0]: 00 02
// val[1]: 01 03
inline uint8x8x2_t Interleave32(const uint8x8_t a, const uint8x8_t b) {
  const uint32x2_t a_32 = vreinterpret_u32_u8(a);
  const uint32x2_t b_32 = vreinterpret_u32_u8(b);
  const uint32x2x2_t c = vtrn_u32(a_32, b_32);
  const uint8x8x2_t d = {vreinterpret_u8_u32(c.val[0]),
                         vreinterpret_u8_u32(c.val[1])};
  return d;
}

// Swap high and low 32 bit elements.
inline uint8x8_t Transpose32(const uint8x8_t a) {
  const uint32x2_t b = vrev64_u32(vreinterpret_u32_u8(a));
  return vreinterpret_u8_u32(b);
}

// Swap high and low halves.
inline uint16x8_t Transpose64(const uint16x8_t a) { return vextq_u16(a, a, 4); }

// Implement vtrnq_s64().
// Input:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
// Output:
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
inline int16x8x2_t VtrnqS64(const int32x4_t a0, const int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

// Unsigned counterpart of VtrnqS64().
inline uint16x8x2_t VtrnqU64(const uint32x4_t a0, const uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}
758 
// In-place transpose of a 4x4 block of uint16_t.
// Input:
// 00 01 02 03
// 10 11 12 13
// 20 21 22 23
// 30 31 32 33
// Output:
// 00 10 20 30
// 01 11 21 31
// 02 12 22 32
// 03 13 23 33
inline void Transpose4x4(uint16x4_t a[4]) {
  // b:
  // 00 10 02 12
  // 01 11 03 13
  const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
  // c:
  // 20 30 22 32
  // 21 31 23 33
  const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
  // d:
  // 00 10 20 30
  // 02 12 22 32
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
  // e:
  // 01 11 21 31
  // 03 13 23 33
  const uint32x2x2_t e =
      vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
  a[0] = vreinterpret_u16_u32(d.val[0]);
  a[1] = vreinterpret_u16_u32(e.val[0]);
  a[2] = vreinterpret_u16_u32(d.val[1]);
  a[3] = vreinterpret_u16_u32(e.val[1]);
}
793 
// Transpose a 4x4 block of uint8_t held across two registers, two rows each.
// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
// Note that columns [1] and [2] are transposed.
// a: 00 10 20 30 02 12 22 32
// b: 01 11 21 31 03 13 23 33
inline void Transpose4x4(uint8x8_t* a, uint8x8_t* b) {
  const uint16x4x2_t c =
      vtrn_u16(vreinterpret_u16_u8(*a), vreinterpret_u16_u8(*b));
  const uint32x2x2_t d =
      vtrn_u32(vreinterpret_u32_u16(c.val[0]), vreinterpret_u32_u16(c.val[1]));
  const uint8x8x2_t e =
      vtrn_u8(vreinterpret_u8_u32(d.val[0]), vreinterpret_u8_u32(d.val[1]));
  *a = e.val[0];
  *b = e.val[1];
}
811 
// In-place transpose of a 4x8 block of uint16_t into an 8x4 block.
// 4x8 Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 00 10 20 30 04 14 24 34
// a[1]: 01 11 21 31 05 15 25 35
// a[2]: 02 12 22 32 06 16 26 36
// a[3]: 03 13 23 33 07 17 27 37
inline void Transpose4x8(uint16x8_t a[4]) {
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  a[0] = vreinterpretq_u16_u32(c0.val[0]);
  a[1] = vreinterpretq_u16_u32(c1.val[0]);
  a[2] = vreinterpretq_u16_u32(c0.val[1]);
  a[3] = vreinterpretq_u16_u32(c1.val[1]);
}
844 
// Special transpose for loop filter.
// 4x8 Input:
// p_q:  p3 p2 p1 p0 q0 q1 q2 q3
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// 8x4 Output:
// a[0]: 03 13 23 33 04 14 24 34  p0q0
// a[1]: 02 12 22 32 05 15 25 35  p1q1
// a[2]: 01 11 21 31 06 16 26 36  p2q2
// a[3]: 00 10 20 30 07 17 27 37  p3q3
// Direct reapplication of the function will reset the high halves, but
// reverse the low halves:
// p_q:  p0 p1 p2 p3 q0 q1 q2 q3
// a[0]: 33 32 31 30 04 05 06 07
// a[1]: 23 22 21 20 14 15 16 17
// a[2]: 13 12 11 10 24 25 26 27
// a[3]: 03 02 01 00 34 35 36 37
// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
// reverse the high halves.
// The standard Transpose4x8 will produce the same reversals, but with the
// order of the low halves also restored relative to the high halves. This is
// preferable because it puts all values from the same source row back together,
// but some post-processing is inevitable.
inline void LoopFilterTranspose4x8(uint16x8_t a[4]) {
  // 16 bit transpose of the row pairs:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);

  // Reverse odd vectors to bring the appropriate items to the front of zips.
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // r0       : 03 13 01 11 07 17 05 15
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // r1       : 23 33 21 31 27 37 25 35
  const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
  const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));

  // Zip to complete the halves.
  // c0.val[0]: 00 10 20 30 02 12 22 32  p3p1
  // c0.val[1]: 04 14 24 34 06 16 26 36  q0q2
  // c1.val[0]: 03 13 23 33 01 11 21 31  p0p2
  // c1.val[1]: 07 17 27 37 05 15 25 35  q3q1
  const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vzipq_u32(r0, r1);

  // Swap 64 bit halves (VtrnqU64 is a file-local helper defined earlier).
  // d0.val[0]: 00 10 20 30 07 17 27 37  p3q3
  // d0.val[1]: 02 12 22 32 05 15 25 35  p1q1
  // d1.val[0]: 03 13 23 33 04 14 24 34  p0q0
  // d1.val[1]: 01 11 21 31 06 16 26 36  p2q2
  const uint16x8x2_t d0 = VtrnqU64(c0.val[0], c1.val[1]);
  // The third row of c comes first here to swap p2 with q0.
  const uint16x8x2_t d1 = VtrnqU64(c1.val[0], c0.val[1]);

  // 8x4 Output:
  // a[0]: 03 13 23 33 04 14 24 34  p0q0
  // a[1]: 02 12 22 32 05 15 25 35  p1q1
  // a[2]: 01 11 21 31 06 16 26 36  p2q2
  // a[3]: 00 10 20 30 07 17 27 37  p3q3
  a[0] = d1.val[0];  // p0q0
  a[1] = d0.val[1];  // p1q1
  a[2] = d1.val[1];  // p2q2
  a[3] = d0.val[0];  // p3q3
}
913 
// Reversible if the x4 values are packed next to each other.
// NOTE: the vectors are eight lanes wide; the original tables listed a ninth
// element per input row, which cannot exist in a uint8x8_t.
// x4 input / x8 output:
// a0: 00 01 02 03 40 41 42 43
// a1: 10 11 12 13 50 51 52 53
// a2: 20 21 22 23 60 61 62 63
// a3: 30 31 32 33 70 71 72 73
// x8 input / x4 output:
// a0: 00 10 20 30 40 50 60 70
// a1: 01 11 21 31 41 51 61 71
// a2: 02 12 22 32 42 52 62 72
// a3: 03 13 23 33 43 53 63 73
inline void Transpose8x4(uint8x8_t* a0, uint8x8_t* a1, uint8x8_t* a2,
                         uint8x8_t* a3) {
  // Swap 8 bit elements within the row pairs.
  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements to complete the transpose of each 4x4 quadrant.
  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}
940 
// Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// a[4]: 40 41 42 43 44 45 46 47
// a[5]: 50 51 52 53 54 55 56 57
// a[6]: 60 61 62 63 64 65 66 67
// a[7]: 70 71 72 73 74 75 76 77

// Output:
// a[0]: 00 10 20 30 40 50 60 70
// a[1]: 01 11 21 31 41 51 61 71
// a[2]: 02 12 22 32 42 52 62 72
// a[3]: 03 13 23 33 43 53 63 73
// a[4]: 04 14 24 34 44 54 64 74
// a[5]: 05 15 25 35 45 55 65 75
// a[6]: 06 16 26 36 46 56 66 76
// a[7]: 07 17 27 37 47 57 67 77
inline void Transpose8x8(int8x8_t a[8]) {
  // Pack row i with row i + 4 into 128 bit registers and swap 8 bit elements:
  // t8_0.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // t8_0.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // t8_1.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // t8_1.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const int8x16x2_t t8_0 =
      vtrnq_s8(vcombine_s8(a[0], a[4]), vcombine_s8(a[1], a[5]));
  const int8x16x2_t t8_1 =
      vtrnq_s8(vcombine_s8(a[2], a[6]), vcombine_s8(a[3], a[7]));

  // Swap 16 bit elements:
  // t16_0.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // t16_0.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // t16_1.val[0]: 01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // t16_1.val[1]: 03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const int16x8x2_t t16_0 = vtrnq_s16(vreinterpretq_s16_s8(t8_0.val[0]),
                                      vreinterpretq_s16_s8(t8_1.val[0]));
  const int16x8x2_t t16_1 = vtrnq_s16(vreinterpretq_s16_s8(t8_0.val[1]),
                                      vreinterpretq_s16_s8(t8_1.val[1]));

  // Unzip 32 bit elements so each register half is one finished output row:
  // z32_0.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // z32_0.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // z32_1.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // z32_1.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const int32x4x2_t z32_0 = vuzpq_s32(vreinterpretq_s32_s16(t16_0.val[0]),
                                      vreinterpretq_s32_s16(t16_1.val[0]));
  const int32x4x2_t z32_1 = vuzpq_s32(vreinterpretq_s32_s16(t16_0.val[1]),
                                      vreinterpretq_s32_s16(t16_1.val[1]));

  a[0] = vreinterpret_s8_s32(vget_low_s32(z32_0.val[0]));
  a[1] = vreinterpret_s8_s32(vget_high_s32(z32_0.val[0]));
  a[2] = vreinterpret_s8_s32(vget_low_s32(z32_1.val[0]));
  a[3] = vreinterpret_s8_s32(vget_high_s32(z32_1.val[0]));
  a[4] = vreinterpret_s8_s32(vget_low_s32(z32_0.val[1]));
  a[5] = vreinterpret_s8_s32(vget_high_s32(z32_0.val[1]));
  a[6] = vreinterpret_s8_s32(vget_low_s32(z32_1.val[1]));
  a[7] = vreinterpret_s8_s32(vget_high_s32(z32_1.val[1]));
}
1009 
// Unsigned. Same 8x8 transpose as the int8x8_t overload; see that function
// for the element-by-element walkthrough of each stage.
inline void Transpose8x8(uint8x8_t a[8]) {
  // 8 bit transpose step, rows packed as (i, i + 4) per 128 bit register.
  const uint8x16x2_t t8_0 =
      vtrnq_u8(vcombine_u8(a[0], a[4]), vcombine_u8(a[1], a[5]));
  const uint8x16x2_t t8_1 =
      vtrnq_u8(vcombine_u8(a[2], a[6]), vcombine_u8(a[3], a[7]));

  // 16 bit transpose step.
  const uint16x8x2_t t16_0 = vtrnq_u16(vreinterpretq_u16_u8(t8_0.val[0]),
                                       vreinterpretq_u16_u8(t8_1.val[0]));
  const uint16x8x2_t t16_1 = vtrnq_u16(vreinterpretq_u16_u8(t8_0.val[1]),
                                       vreinterpretq_u16_u8(t8_1.val[1]));

  // 32 bit unzip gathers each finished output row into a register half.
  const uint32x4x2_t z32_0 = vuzpq_u32(vreinterpretq_u32_u16(t16_0.val[0]),
                                       vreinterpretq_u32_u16(t16_1.val[0]));
  const uint32x4x2_t z32_1 = vuzpq_u32(vreinterpretq_u32_u16(t16_0.val[1]),
                                       vreinterpretq_u32_u16(t16_1.val[1]));

  a[0] = vreinterpret_u8_u32(vget_low_u32(z32_0.val[0]));
  a[1] = vreinterpret_u8_u32(vget_high_u32(z32_0.val[0]));
  a[2] = vreinterpret_u8_u32(vget_low_u32(z32_1.val[0]));
  a[3] = vreinterpret_u8_u32(vget_high_u32(z32_1.val[0]));
  a[4] = vreinterpret_u8_u32(vget_low_u32(z32_0.val[1]));
  a[5] = vreinterpret_u8_u32(vget_high_u32(z32_0.val[1]));
  a[6] = vreinterpret_u8_u32(vget_low_u32(z32_1.val[1]));
  a[7] = vreinterpret_u8_u32(vget_high_u32(z32_1.val[1]));
}
1036 
// 8x8 byte transpose that leaves the result packed in four 128 bit registers:
// out[0] holds transposed rows 0 and 1, out[1] rows 2 and 3, out[2] rows 4
// and 5, out[3] rows 6 and 7 (same stages as the in-place uint8x8_t overload,
// minus the final vget_low/vget_high split).
inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
  // 8 bit transpose step, rows packed as (i, i + 4) per 128 bit register.
  const uint8x16x2_t t8_0 =
      vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
  const uint8x16x2_t t8_1 =
      vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));

  // 16 bit transpose step.
  const uint16x8x2_t t16_0 = vtrnq_u16(vreinterpretq_u16_u8(t8_0.val[0]),
                                       vreinterpretq_u16_u8(t8_1.val[0]));
  const uint16x8x2_t t16_1 = vtrnq_u16(vreinterpretq_u16_u8(t8_0.val[1]),
                                       vreinterpretq_u16_u8(t8_1.val[1]));

  // 32 bit unzip gathers each finished output row into a register half.
  const uint32x4x2_t z32_0 = vuzpq_u32(vreinterpretq_u32_u16(t16_0.val[0]),
                                       vreinterpretq_u32_u16(t16_1.val[0]));
  const uint32x4x2_t z32_1 = vuzpq_u32(vreinterpretq_u32_u16(t16_0.val[1]),
                                       vreinterpretq_u32_u16(t16_1.val[1]));

  out[0] = vreinterpretq_u8_u32(z32_0.val[0]);
  out[1] = vreinterpretq_u8_u32(z32_1.val[0]);
  out[2] = vreinterpretq_u8_u32(z32_0.val[1]);
  out[3] = vreinterpretq_u8_u32(z32_1.val[1]);
}
1058 
// Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
// a[2]: 20 21 22 23 24 25 26 27
// a[3]: 30 31 32 33 34 35 36 37
// a[4]: 40 41 42 43 44 45 46 47
// a[5]: 50 51 52 53 54 55 56 57
// a[6]: 60 61 62 63 64 65 66 67
// a[7]: 70 71 72 73 74 75 76 77

// Output:
// a[0]: 00 10 20 30 40 50 60 70
// a[1]: 01 11 21 31 41 51 61 71
// a[2]: 02 12 22 32 42 52 62 72
// a[3]: 03 13 23 33 43 53 63 73
// a[4]: 04 14 24 34 44 54 64 74
// a[5]: 05 15 25 35 45 55 65 75
// a[6]: 06 16 26 36 46 56 66 76
// a[7]: 07 17 27 37 47 57 67 77
inline void Transpose8x8(int16x8_t a[8]) {
  // 16 bit transpose of the four adjacent row pairs.
  const int16x8x2_t t16_0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t t16_1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t t16_2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t t16_3 = vtrnq_s16(a[6], a[7]);

  // 32 bit transpose combines the 2x2 sub-blocks into 4x4 blocks.
  const int32x4x2_t t32_0 = vtrnq_s32(vreinterpretq_s32_s16(t16_0.val[0]),
                                      vreinterpretq_s32_s16(t16_1.val[0]));
  const int32x4x2_t t32_1 = vtrnq_s32(vreinterpretq_s32_s16(t16_0.val[1]),
                                      vreinterpretq_s32_s16(t16_1.val[1]));
  const int32x4x2_t t32_2 = vtrnq_s32(vreinterpretq_s32_s16(t16_2.val[0]),
                                      vreinterpretq_s32_s16(t16_3.val[0]));
  const int32x4x2_t t32_3 = vtrnq_s32(vreinterpretq_s32_s16(t16_2.val[1]),
                                      vreinterpretq_s32_s16(t16_3.val[1]));

  // 64 bit transpose (file-local VtrnqS64 helper) assembles the final rows.
  const int16x8x2_t t64_0 = VtrnqS64(t32_0.val[0], t32_2.val[0]);
  const int16x8x2_t t64_1 = VtrnqS64(t32_1.val[0], t32_3.val[0]);
  const int16x8x2_t t64_2 = VtrnqS64(t32_0.val[1], t32_2.val[1]);
  const int16x8x2_t t64_3 = VtrnqS64(t32_1.val[1], t32_3.val[1]);

  a[0] = t64_0.val[0];
  a[1] = t64_1.val[0];
  a[2] = t64_2.val[0];
  a[3] = t64_3.val[0];
  a[4] = t64_0.val[1];
  a[5] = t64_1.val[1];
  a[6] = t64_2.val[1];
  a[7] = t64_3.val[1];
}
1107 
// Unsigned. Same 8x8 transpose as the int16x8_t overload above.
inline void Transpose8x8(uint16x8_t a[8]) {
  // 16 bit transpose of the four adjacent row pairs.
  const uint16x8x2_t t16_0 = vtrnq_u16(a[0], a[1]);
  const uint16x8x2_t t16_1 = vtrnq_u16(a[2], a[3]);
  const uint16x8x2_t t16_2 = vtrnq_u16(a[4], a[5]);
  const uint16x8x2_t t16_3 = vtrnq_u16(a[6], a[7]);

  // 32 bit transpose combines the 2x2 sub-blocks into 4x4 blocks.
  const uint32x4x2_t t32_0 = vtrnq_u32(vreinterpretq_u32_u16(t16_0.val[0]),
                                       vreinterpretq_u32_u16(t16_1.val[0]));
  const uint32x4x2_t t32_1 = vtrnq_u32(vreinterpretq_u32_u16(t16_0.val[1]),
                                       vreinterpretq_u32_u16(t16_1.val[1]));
  const uint32x4x2_t t32_2 = vtrnq_u32(vreinterpretq_u32_u16(t16_2.val[0]),
                                       vreinterpretq_u32_u16(t16_3.val[0]));
  const uint32x4x2_t t32_3 = vtrnq_u32(vreinterpretq_u32_u16(t16_2.val[1]),
                                       vreinterpretq_u32_u16(t16_3.val[1]));

  // 64 bit transpose (file-local VtrnqU64 helper) assembles the final rows.
  const uint16x8x2_t t64_0 = VtrnqU64(t32_0.val[0], t32_2.val[0]);
  const uint16x8x2_t t64_1 = VtrnqU64(t32_1.val[0], t32_3.val[0]);
  const uint16x8x2_t t64_2 = VtrnqU64(t32_0.val[1], t32_2.val[1]);
  const uint16x8x2_t t64_3 = VtrnqU64(t32_1.val[1], t32_3.val[1]);

  a[0] = t64_0.val[0];
  a[1] = t64_1.val[0];
  a[2] = t64_2.val[0];
  a[3] = t64_3.val[0];
  a[4] = t64_0.val[1];
  a[5] = t64_1.val[1];
  a[6] = t64_2.val[1];
  a[7] = t64_3.val[1];
}
1138 
// Transposes two side-by-side 8x8 blocks held in eight 128 bit registers
// (rows 0-7 in the low halves, rows 8-f in the high halves).
// Input:
// a[0]: 00 01 02 03 04 05 06 07  80 81 82 83 84 85 86 87
// a[1]: 10 11 12 13 14 15 16 17  90 91 92 93 94 95 96 97
// a[2]: 20 21 22 23 24 25 26 27  a0 a1 a2 a3 a4 a5 a6 a7
// a[3]: 30 31 32 33 34 35 36 37  b0 b1 b2 b3 b4 b5 b6 b7
// a[4]: 40 41 42 43 44 45 46 47  c0 c1 c2 c3 c4 c5 c6 c7
// a[5]: 50 51 52 53 54 55 56 57  d0 d1 d2 d3 d4 d5 d6 d7
// a[6]: 60 61 62 63 64 65 66 67  e0 e1 e2 e3 e4 e5 e6 e7
// a[7]: 70 71 72 73 74 75 76 77  f0 f1 f2 f3 f4 f5 f6 f7

// Output:
// a[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
// a[1]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
// a[2]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
// a[3]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
// a[4]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
// a[5]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
// a[6]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
// a[7]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
inline void Transpose8x16(uint8x16_t a[8]) {
  // Swap 8 bit elements. Goes to:
  // b0.val[0]: 00 10 02 12 04 14 06 16  80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17  81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36  a0 b0 a2 b2 a4 b4 a6 b6
  // b1.val[1]: 21 31 23 33 25 35 27 37  a1 b1 a3 b3 a5 b5 a7 b7
  // b2.val[0]: 40 50 42 52 44 54 46 56  c0 d0 c2 d2 c4 d4 c6 d6
  // b2.val[1]: 41 51 43 53 45 55 47 57  c1 d1 c3 d3 c5 d5 c7 d7
  // b3.val[0]: 60 70 62 72 64 74 66 76  e0 f0 e2 f2 e4 f4 e6 f6
  // b3.val[1]: 61 71 63 73 65 75 67 77  e1 f1 e3 f3 e5 f5 e7 f7
  const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
  const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
  const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
  const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34  80 90 a0 b0 84 94 a4 b4
  // c0.val[1]: 02 12 22 32 06 16 26 36  82 92 a2 b2 86 96 a6 b6
  // c1.val[0]: 01 11 21 31 05 15 25 35  81 91 a1 b1 85 95 a5 b5
  // c1.val[1]: 03 13 23 33 07 17 27 37  83 93 a3 b3 87 97 a7 b7
  // c2.val[0]: 40 50 60 70 44 54 64 74  c0 d0 e0 f0 c4 d4 e4 f4
  // c2.val[1]: 42 52 62 72 46 56 66 76  c2 d2 e2 f2 c6 d6 e6 f6
  // c3.val[0]: 41 51 61 71 45 55 65 75  c1 d1 e1 f1 c5 d5 e5 f5
  // c3.val[1]: 43 53 63 73 47 57 67 77  c3 d3 e3 f3 c7 d7 e7 f7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in the finished rows:
  // d0.val[0]: 00 10 20 30 40 50 60 70  80 90 a0 b0 c0 d0 e0 f0
  // d0.val[1]: 04 14 24 34 44 54 64 74  84 94 a4 b4 c4 d4 e4 f4
  // d1.val[0]: 01 11 21 31 41 51 61 71  81 91 a1 b1 c1 d1 e1 f1
  // d1.val[1]: 05 15 25 35 45 55 65 75  85 95 a5 b5 c5 d5 e5 f5
  // d2.val[0]: 02 12 22 32 42 52 62 72  82 92 a2 b2 c2 d2 e2 f2
  // d2.val[1]: 06 16 26 36 46 56 66 76  86 96 a6 b6 c6 d6 e6 f6
  // d3.val[0]: 03 13 23 33 43 53 63 73  83 93 a3 b3 c3 d3 e3 f3
  // d3.val[1]: 07 17 27 37 47 57 67 77  87 97 a7 b7 c7 d7 e7 f7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  a[0] = vreinterpretq_u8_u32(d0.val[0]);
  a[1] = vreinterpretq_u8_u32(d1.val[0]);
  a[2] = vreinterpretq_u8_u32(d2.val[0]);
  a[3] = vreinterpretq_u8_u32(d3.val[0]);
  a[4] = vreinterpretq_u8_u32(d0.val[1]);
  a[5] = vreinterpretq_u8_u32(d1.val[1]);
  a[6] = vreinterpretq_u8_u32(d2.val[1]);
  a[7] = vreinterpretq_u8_u32(d3.val[1]);
}
1215 
// Widens each unsigned 8 bit lane to 16 bits, then reinterprets the result as
// signed. Values are in [0, 255], so the sign bit is never set.
inline int16x8_t ZeroExtend(const uint8x8_t in) {
  const uint16x8_t widened = vmovl_u8(in);
  return vreinterpretq_s16_u16(widened);
}
1219 
1220 }  // namespace dsp
1221 }  // namespace libgav1
1222 
1223 #endif  // LIBGAV1_ENABLE_NEON
1224 #endif  // LIBGAV1_SRC_DSP_ARM_COMMON_NEON_H_
1225