1 // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 // common.h: contains stuff that's used throughout gemmlowp
16 // and should always be available.
17
18 #ifndef GEMMLOWP_INTERNAL_COMMON_H_
19 #define GEMMLOWP_INTERNAL_COMMON_H_
20
21 #include <pthread.h>
22
23 #include <algorithm>
24 #include <cassert>
25 #include <cmath>
26 #include <cstdlib>
27
28 #include "../profiling/instrumentation.h"
29
// gemmlowp's inline assembly paths assume GCC/Clang syntax, so they are only
// allowed when __GNUC__ is defined. Native Client is excluded because it
// doesn't seem to support inline assembly(?).
#ifdef __GNUC__
#ifndef __native_client__
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif
#endif

// GEMMLOWP_NOINLINE: function annotation that prevents inlining.
// Expands to the GCC noinline attribute when __GNUC__ is defined, and to
// nothing on any other compiler.
#ifndef __GNUC__
#define GEMMLOWP_NOINLINE
#else
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#endif
43
// ARM detection.
//   GEMMLOWP_ARM_32: the compiler defines __arm__.
//   GEMMLOWP_ARM_64: the compiler defines __aarch64__.
//   GEMMLOWP_ARM:    either of the above.
#if defined(__arm__)
#define GEMMLOWP_ARM_32
#endif

#if defined(__aarch64__)
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_64) || defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_ARM
#endif

// x86 detection. Several spellings are checked for each width because
// different compilers define different tokens.
//   GEMMLOWP_X86_32: 32-bit x86.
//   GEMMLOWP_X86_64: 64-bit x86.
//   GEMMLOWP_X86:    either of the above.
#if defined(__i386) || defined(__i386__) || defined(_X86_) || defined(_M_IX86)
#define GEMMLOWP_X86_32
#endif

#if defined(__amd64) || defined(__x86_64__) || defined(_M_X64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_64) || defined(GEMMLOWP_X86_32)
#define GEMMLOWP_X86
#endif
69
// Some of our optimized paths use inline assembly, and for now we don't
// bother enabling the other optimized paths that use intrinsics on platforms
// where the inline assembly paths can't be used. Consequently every SIMD
// token below is only defined when GEMMLOWP_ALLOW_INLINE_ASM is defined.
#if defined(GEMMLOWP_ALLOW_INLINE_ASM)

// NEON detection. It's important to check for both spellings of the token.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define GEMMLOWP_NEON
#endif

// Convenience tokens: NEON on 32-bit ARM / NEON on 64-bit ARM.
#if defined(GEMMLOWP_ARM_32) && defined(GEMMLOWP_NEON)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_ARM_64) && defined(GEMMLOWP_NEON)
#define GEMMLOWP_NEON_64
#endif

// SSE detection: SSE4 is keyed on __SSE4_1__, SSE3 on __SSE3__.
#if defined(__SSE4_1__)
#define GEMMLOWP_SSE4
#endif

#if defined(__SSE3__)
#define GEMMLOWP_SSE3
#endif

// Convenience tokens: SSE4/SSE3 on 32-bit x86.
#if defined(GEMMLOWP_X86_32) && defined(GEMMLOWP_SSE4)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_X86_32) && defined(GEMMLOWP_SSE3)
#define GEMMLOWP_SSE3_32
#endif

// Convenience tokens: SSE4/SSE3 on 64-bit x86.
#if defined(GEMMLOWP_X86_64) && defined(GEMMLOWP_SSE4)
#define GEMMLOWP_SSE4_64
#endif

#if defined(GEMMLOWP_X86_64) && defined(GEMMLOWP_SSE3)
#define GEMMLOWP_SSE3_64
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM
116
// Android detection, keyed on __ANDROID__. This is deliberately kept separate
// from ARM detection: we care about tuning for non-ARM Android devices too.
// Combined with the x86 tokens it allows tuning differently for mobile x86
// CPUs (Atom) and desktop x86 CPUs.
#ifdef __ANDROID__
#define GEMMLOWP_ANDROID
#endif
123
124 namespace gemmlowp {
125
// Assumed cache line size, in bytes. Useful for optimizing alignment and
// prefetches. This is a compile-time default rather than a value queried at
// runtime: 64-byte cache lines are the vast majority, and on a device where
// this is wrong it should be off by no more than a factor of 2, which is
// considered acceptable.
const int kDefaultCacheLineSize = 64;
132
// Default sizes, in bytes, of the L1 and L2 data caches.
//   * L1 is assumed to be per-core.
//   * L2 is assumed to be shared among all cores; what is called 'L2' here
//     is effectively the top-level cache.
//
// On x86 these should ideally be queried at runtime. On ARM, the instruction
// to query them is privileged and Android kernels do not expose it to
// userspace. Fortunately, most ARM devices have roughly comparable values:
//   Nexus 5:     L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The ARM/Android defaults below are equal to or somewhat lower than those,
// and were found to perform well on both the Nexus 5 and Android One.
// In principle such values are too low for typical x86 CPUs, where the L2
// value should be at least (L3 cache size / number of cores).
#if defined(__APPLE__) && defined(GEMMLOWP_ARM)
// ARM on an Apple platform: iPhone/iPad.
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ANDROID) || defined(GEMMLOWP_ARM)
// Other ARM or ARM-like hardware (Android implies ARM-like), so tuning for
// ARM is OK here. On x86 Atom it might be possible to query cache sizes at
// runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android, hence likely desktop-class x86 hardware. Larger
// caches are assumed, though they really should be queried at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same reasoning as x86-64, but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Less common hardware: maybe something unusual, older or embedded.
// Smaller caches are assumed, without departing too far from the ARM/Android
// values, to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
177
// Fraction of the L2 cache intended for storing RHS blocks. It should lie
// between 0 and 1 and is typically close to 1, since most of the L2 cache is
// normally wanted for holding one large RHS block.
#ifdef GEMMLOWP_X86
// On IA the entire L2 cache is given to the RHS matrix; the LHS matrix is not
// blocked for the L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif
189
// Size of a SIMD register, in bytes. It determines the dimensions of
// PackingRegisterBlock, so that such blocks can be loaded efficiently into
// registers and packing code can operate within registers as much as
// possible.
// The non-SIMD generic fallback code treats this as a plain array size, so
// any value would work there. A platform may choose a different value, but
// must then keep its own optimized packing paths consistent with it.
const int kRegisterSize = 16;
199
// Hints the CPU to prefetch the cache line containing ptr.
// One of three implementations is selected at compile time; on a compiler
// with no known prefetch mechanism this function does nothing.
inline void Prefetch(const void* ptr) {
#if defined(GEMMLOWP_ALLOW_INLINE_ASM) && defined(GEMMLOWP_ARM_64)
  // Aarch64 has very detailed prefetch instructions that compilers can't
  // know how to map __builtin_prefetch to; as a result they don't, which
  // leaves __builtin_prefetch a no-op on this architecture. We therefore
  // emit the instruction ourselves. "pldl1keep" is usually what we want:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) : );
#elif defined(__GNUC__)
  // Clang and GCC both define __GNUC__ and both provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No prefetch available; reference ptr so that it is not reported as an
  // unused parameter.
  (void)ptr;
#endif
}
216
// Rounds the runtime value i down to the nearest multiple of the
// compile-time constant Modulus, by subtracting the remainder i % Modulus.
//
// Modulus is unsigned, so for an int-sized signed Integer the remainder is
// evaluated in unsigned arithmetic. The result is as expected for
// non-negative i; a negative i only yields a multiple of Modulus when Modulus
// is a power of two. Modulus must be non-zero.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return static_cast<Integer>(i - (i % Modulus));
}
223
224 // Returns the runtime argument rounded up to the nearest multiple of
225 // the fixed Modulus.
226 template <unsigned Modulus, typename Integer>
RoundUp(Integer i)227 Integer RoundUp(Integer i) {
228 return RoundDown<Modulus>(i + Modulus - 1);
229 }
230
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
//
// Requires b > 0. The result is the truncated quotient, plus one when the
// division leaves a positive remainder. Unlike the textbook form
// (a + b - 1) / b, this never forms the sum a + b - 1, so it cannot overflow
// (or wrap around, for unsigned types) when a is close to the maximum value
// of Integer. It is also correct for negative a, where truncation toward
// zero already rounds up.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return a / b + (a % b > 0 ? 1 : 0);
}
236
// Returns the argument rounded up to the nearest power of two; a value that
// is already a power of two is returned unchanged. An argument of 0 yields 0.
//
// Works by copying the highest set bit of n - 1 into every lower bit position
// and then adding one. The copying is done with shifts of 1, 2, 4, ... up to
// half the width of Integer, so that 64-bit integer types are handled
// correctly as well as types of 32 bits or fewer.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Width of Integer in bits; assumes 8-bit bytes, as on every POSIX platform.
  const unsigned num_bits = static_cast<unsigned>(8 * sizeof(Integer));
  Integer i = n - 1;
  for (unsigned shift = 1; shift < num_bits; shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
248
// Compile-time predicate: IsPowerOfTwo<N>::value is true when N has at most
// one bit set, i.e. when clearing the lowest set bit of N leaves zero.
// Note that by this definition N == 0 also yields true.
template <int N>
struct IsPowerOfTwo {
  static const bool value = (N & (N - 1)) == 0;
};
253
254 } // namespace gemmlowp
255
256 #endif // GEMMLOWP_INTERNAL_COMMON_H_
257