1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // NEON common code.
11
12 #ifndef WEBP_DSP_NEON_H_
13 #define WEBP_DSP_NEON_H_
14
15 #include <arm_neon.h>
16
17 #include "./dsp.h"
18
// Some intrinsics-based functions currently seem slower than their inline
// assembly counterparts, so intrinsics stay disabled everywhere except on
// aarch64, where the inline assembly is incompatible.
#ifdef __aarch64__
#define WEBP_USE_INTRINSICS   // use intrinsics when possible
#endif
24
// Assigns 'a' and 'b' to the first two entries of v.val[].
// 'v' may be any lvalue expression whose type has a val[] member (it is
// expanded once per assignment, so it should be free of side effects).
// All parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
29
// Assigns 'a', 'b' and 'c' to the first three entries of v.val[].
// 'v' may be any lvalue expression whose type has a val[] member (it is
// expanded once per assignment, so it should be free of side effects).
// All parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
35
// Assigns 'a', 'b', 'c' and 'd' to the first four entries of v.val[].
// 'v' may be any lvalue expression whose type has a val[] member (it is
// expanded once per assignment, so it should be free of side effects).
// All parameters are parenthesized so that arguments such as '*ptr' expand
// with the intended precedence.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
42
43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
44 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
47 #define WORK_AROUND_GCC
48 #endif
49
// Returns the transpose of the 4x4 matrix of 32-bit values held in 'rows':
// out.val[j] lane i equals rows.val[i] lane j.
static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
  // View every row as a pair of 64-bit lanes so that half-rows can be moved
  // around as single units.
  const uint64x2_t r0 = vreinterpretq_u64_s32(rows.val[0]);
  const uint64x2_t r1 = vreinterpretq_u64_s32(rows.val[1]);
  const uint64x2_t r2 = vreinterpretq_u64_s32(rows.val[2]);
  const uint64x2_t r3 = vreinterpretq_u64_s32(rows.val[3]);

  // 64-bit stage (there's no vswp equivalent): pair the low halves of rows
  // 0/2 and 1/3, and likewise the high halves.
  const uint64x2_t low02 = vcombine_u64(vget_low_u64(r0), vget_low_u64(r2));
  const uint64x2_t low13 = vcombine_u64(vget_low_u64(r1), vget_low_u64(r3));
  const uint64x2_t high02 = vcombine_u64(vget_high_u64(r0), vget_high_u64(r2));
  const uint64x2_t high13 = vcombine_u64(vget_high_u64(r1), vget_high_u64(r3));

  // 32-bit stage: interleaving the paired halves produces two output rows
  // per vtrnq_s32() call.
  const int32x4x2_t cols01 = vtrnq_s32(vreinterpretq_s32_u64(low02),
                                       vreinterpretq_s32_u64(low13));
  const int32x4x2_t cols23 = vtrnq_s32(vreinterpretq_s32_u64(high02),
                                       vreinterpretq_s32_u64(high13));

  int32x4x4_t transposed;
  transposed.val[0] = cols01.val[0];
  transposed.val[1] = cols01.val[1];
  transposed.val[2] = cols23.val[0];
  transposed.val[3] = cols23.val[1];
  return transposed;
}
81
#if 0   // Useful debug macro.
#include <stdio.h>
// Prints the spelling of REG, the value of SIZE, and the contents of REG in
// hexadecimal, followed by a newline.
//   SIZE == 8:  REG is stored with vst1_u8() and printed as 8 bytes.
//   SIZE == 16: REG is stored with vst1_u16() and printed as 4 16-bit values.
// Any other SIZE prints only the header and the newline.
// The macro's locals carry a print_reg_ prefix so that they cannot capture
// identifiers (e.g. a caller's 'i') appearing in the REG or SIZE arguments.
#define PRINT_REG(REG, SIZE) do {                                     \
  int print_reg_i_;                                                   \
  printf("%s \t[%d]: 0x", #REG, (SIZE));                              \
  if ((SIZE) == 8) {                                                  \
    uint8_t print_reg_tmp_[8];                                        \
    vst1_u8(print_reg_tmp_, (REG));                                   \
    for (print_reg_i_ = 0; print_reg_i_ < 8; ++print_reg_i_) {        \
      printf("%.2x ", print_reg_tmp_[print_reg_i_]);                  \
    }                                                                 \
  } else if ((SIZE) == 16) {                                          \
    uint16_t print_reg_tmp_[4];                                       \
    vst1_u16(print_reg_tmp_, (REG));                                  \
    for (print_reg_i_ = 0; print_reg_i_ < 4; ++print_reg_i_) {        \
      printf("%.4x ", print_reg_tmp_[print_reg_i_]);                  \
    }                                                                 \
  }                                                                   \
  printf("\n");                                                       \
} while (0)
#endif
99
100 #endif // WEBP_DSP_NEON_H_
101