• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <pthread.h>
7 #include <sched.h>
8 #ifdef __ANDROID__
9   #include <malloc.h>
10 #endif
11 #if defined(__SSE__) || defined(__x86_64__)
12   #include <xmmintrin.h>
13 #endif
14 
15 #include <cstdio>
16 #include <cstdlib>
17 #include <cstring>
18 
19 #include <cpuinfo.h>
20 
21 #include "bench/utils.h"
22 
23 
24 static void* wipe_buffer = nullptr;
25 static size_t wipe_buffer_size = 0;
26 
27 static pthread_once_t wipe_buffer_guard = PTHREAD_ONCE_INIT;
28 
InitWipeBuffer()29 static void InitWipeBuffer() {
30   // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
31   wipe_buffer_size = 128 * 1024 * 1024;
32   if (cpuinfo_initialize()) {
33     wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
34   }
35 #if defined(__ANDROID__)
36   // memalign is obsolete, but it is the only option on Android until API level 17.
37   wipe_buffer = memalign(128, wipe_buffer_size);
38 #else
39   (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
40 #endif
41   if (wipe_buffer != nullptr) {
42     memset(wipe_buffer, 0xA5, wipe_buffer_size);
43   }
44 }
45 
46 namespace benchmark {
47 namespace utils {
48 
PrefetchToL1(const void * ptr,size_t size)49 uint32_t PrefetchToL1(const void* ptr, size_t size) {
50   uint32_t step = 16;
51   if (cpuinfo_initialize()) {
52     step = cpuinfo_get_l1d_cache(0)->line_size;
53   }
54   const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
55   // Compute and return sum of data to prevent compiler from removing data reads.
56   uint32_t sum = 0;
57   while (size >= step) {
58     sum += uint32_t(*u8_ptr);
59     u8_ptr += step;
60     size -= step;
61   }
62   return sum;
63 }
64 
WipeCache()65 uint32_t WipeCache() {
66   pthread_once(&wipe_buffer_guard, &InitWipeBuffer);
67   return PrefetchToL1(wipe_buffer, wipe_buffer_size);
68 }
69 
// Switches the calling thread's floating-point unit to flush denormal numbers to zero.
//
//  - x86 with SSE: sets the FTZ (bit 15) and DAZ (bit 6) bits of MXCSR.
//  - 32-bit ARM with hardware FP: sets bit 24 (0x1000000) of FPSCR.
//  - AArch64: sets bit 24 (0x1000000) and bit 19 (0x80000) of FPCR.
//  - Any other target: does nothing.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  const unsigned int flush_to_zero = 0x8000u;
  const unsigned int denormals_are_zero = 0x0040u;
  const unsigned int mxcsr = _mm_getcsr();
  _mm_setcsr(mxcsr | flush_to_zero | denormals_are_zero);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // Read-modify-write of FPSCR; the variable only serves as the scratch register.
  uint32_t fpscr;
  __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
    : [fpscr] "=r" (fpscr));
#elif defined(__aarch64__)
  // Read-modify-write of FPCR; the variable only serves as the scratch register.
  uint64_t fpcr;
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
90 
// Returns the current clock rate, in Hz, of the CPU the calling thread is running on.
//
// On Linux the value is read from the cpufreq "scaling_cur_freq" sysfs node of that CPU
// and multiplied by 1000. Returns 0 when the rate cannot be determined: on non-Linux
// platforms, when the current CPU cannot be identified, or when the node is missing,
// empty, or does not hold a positive integer.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu < 0) {
    return 0;
  }

  char cpufreq_path[512];
  snprintf(cpufreq_path, sizeof(cpufreq_path),
    "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

  FILE* f = fopen(cpufreq_path, "r");
  if (f != nullptr) {
    int freq = 0;
    // fscanf returns EOF (a non-zero value) on an empty file, so only an exact count of
    // one converted field means a frequency was actually parsed.
    const bool parsed = fscanf(f, "%d", &freq) == 1;
    fclose(f);
    // Reject non-positive values: a negative int would wrap to a huge uint64_t.
    if (parsed && freq > 0) {
      return uint64_t(freq) * 1000;
    }
  }
#endif  // __linux__
  return 0;
}
111 
GetMaxCacheSize()112 size_t GetMaxCacheSize() {
113   if (!cpuinfo_initialize()) {
114     #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
115       // DynamIQ max: 4 MB
116       return 4 * 1024 * 1024;
117     #else
118       // Intel eDRAM max: 128 MB
119       return 128 * 1024 * 1024;
120     #endif
121   }
122   const cpuinfo_processor* processor = cpuinfo_get_processor(0);
123   #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
124     // There is no precise way to detect cache size on ARM/ARM64, and cache size reported by cpuinfo
125     // may underestimate the actual cache size. Thus, we use microarchitecture-specific maximum.
126     switch (processor->core->uarch) {
127       case cpuinfo_uarch_xscale:
128       case cpuinfo_uarch_arm11:
129       case cpuinfo_uarch_scorpion:
130       case cpuinfo_uarch_krait:
131       case cpuinfo_uarch_kryo:
132       case cpuinfo_uarch_exynos_m1:
133       case cpuinfo_uarch_exynos_m2:
134       case cpuinfo_uarch_exynos_m3:
135         // cpuinfo-detected cache size always correct.
136         break;
137       case cpuinfo_uarch_cortex_a5:
138         // Max observed (NXP Vybrid SoC)
139         return 512 * 1024;
140       case cpuinfo_uarch_cortex_a7:
141         // Cortex-A7 MPCore Technical Reference Manual:
142         // 7.1. About the L2 Memory system
143         //   The L2 memory system consists of an:
144         //    - Optional tightly-coupled L2 cache that includes:
145         //      - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
146         return 1024 * 1024;
147       case cpuinfo_uarch_cortex_a8:
148         // Cortex-A8 Technical Reference Manual:
149         // 8.1. About the L2 memory system
150         //   The key features of the L2 memory system include:
151         //    - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB
152         return 1024 * 1024;
153       case cpuinfo_uarch_cortex_a9:
154         // Max observed (e.g. Exynos 4212)
155         return 1024 * 1024;
156       case cpuinfo_uarch_cortex_a12:
157       case cpuinfo_uarch_cortex_a17:
158         // ARM Cortex-A17 MPCore Processor Technical Reference Manual:
159         // 7.1. About the L2 Memory system
160         //   The key features of the L2 memory system include:
161         //    - An integrated L2 cache:
162         //      - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
163         return 8 * 1024 * 1024;
164       case cpuinfo_uarch_cortex_a15:
165         // ARM Cortex-A15 MPCore Processor Technical Reference Manual:
166         // 7.1. About the L2 memory system
167         //   The features of the L2 memory system include:
168         //    - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
169         return 4 * 1024 * 1024;
170       case cpuinfo_uarch_cortex_a35:
171         // ARM Cortex‑A35 Processor Technical Reference Manual:
172         // 7.1 About the L2 memory system
173         //   L2 cache
174         //    - Further features of the L2 cache are:
175         //      - Configurable size of 128KB, 256KB, 512KB, and 1MB.
176         return 1024 * 1024;
177       case cpuinfo_uarch_cortex_a53:
178         // ARM Cortex-A53 MPCore Processor Technical Reference Manual:
179         // 7.1. About the L2 memory system
180         //   The L2 memory system consists of an:
181         //    - Optional tightly-coupled L2 cache that includes:
182         //      - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
183         return 2 * 1024 * 1024;
184       case cpuinfo_uarch_cortex_a57:
185         // ARM Cortex-A57 MPCore Processor Technical Reference Manual:
186         // 7.1 About the L2 memory system
187         //   The features of the L2 memory system include:
188         //    - Configurable L2 cache size of 512KB, 1MB, and 2MB.
189         return 2 * 1024 * 1024;
190       case cpuinfo_uarch_cortex_a72:
191         // ARM Cortex-A72 MPCore Processor Technical Reference Manual:
192         // 7.1 About the L2 memory system
193         //   The features of the L2 memory system include:
194         //    - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
195         return 4 * 1024 * 1024;
196       case cpuinfo_uarch_cortex_a73:
197         // ARM Cortex‑A73 MPCore Processor Technical Reference Manual
198         // 7.1 About the L2 memory system
199         //   The L2 memory system consists of:
200         //    - A tightly-integrated L2 cache with:
201         //       - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
202         return 8 * 1024 * 1024;
203       default:
204         // ARM DynamIQ Shared Unit Technical Reference Manual
205         // 1.3 Implementation options
206         //   L3_CACHE_SIZE
207         //    - 256KB
208         //    - 512KB
209         //    - 1024KB
210         //    - 1536KB
211         //    - 2048KB
212         //    - 3072KB
213         //    - 4096KB
214         return 4 * 1024 * 1024;
215     }
216   #endif
217   if (processor->cache.l4 != NULL) {
218     return processor->cache.l4->size;
219   } else if (processor->cache.l3 != NULL) {
220     return processor->cache.l3->size;
221   } else if (processor->cache.l2 != NULL) {
222     return processor->cache.l2->size;
223   } else if (processor->cache.l1d != NULL) {
224     return processor->cache.l1d->size;
225   } else {
226     return 0;
227   }
228 }
229 
MultiThreadingParameters(benchmark::internal::Benchmark * benchmark)230 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
231   benchmark->ArgName("T");
232 
233   // Disabled thread pool (execution on the caller thread only).
234   benchmark->Arg(1);
235 
236   if (cpuinfo_initialize()) {
237     // All cores except the little ones.
238     uint32_t max_cores = cpuinfo_get_cores_count();
239     if (cpuinfo_get_clusters_count() > 1) {
240       max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
241     }
242     for (uint32_t t = 2; t <= max_cores; t++) {
243       benchmark->Arg(t);
244     }
245 
246     // All cores (if more than one cluster).
247     if (cpuinfo_get_cores_count() > max_cores) {
248       benchmark->Arg(cpuinfo_get_cores_count());
249     }
250 
251     // All cores + hyperthreads (only if hyperthreading supported).
252     if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
253       benchmark->Arg(cpuinfo_get_processors_count());
254     }
255   }
256 }
257 
258 
CheckNEON(benchmark::State & state)259 bool CheckNEON(benchmark::State& state) {
260   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
261     state.SkipWithError("no NEON extension");
262     return false;
263   }
264   return true;
265 }
266 
CheckNEONFMA(benchmark::State & state)267 bool CheckNEONFMA(benchmark::State& state) {
268   if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
269     state.SkipWithError("no NEON-FMA extension");
270     return false;
271   }
272   return true;
273 }
274 
CheckSSE41(benchmark::State & state)275 bool CheckSSE41(benchmark::State& state) {
276   if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
277     state.SkipWithError("no SSE4.1 extension");
278     return false;
279   }
280   return true;
281 }
282 
CheckAVX(benchmark::State & state)283 bool CheckAVX(benchmark::State& state) {
284   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
285     state.SkipWithError("no AVX extension");
286     return false;
287   }
288   return true;
289 }
290 
CheckFMA3(benchmark::State & state)291 bool CheckFMA3(benchmark::State& state) {
292   if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
293     state.SkipWithError("no FMA3 extension");
294     return false;
295   }
296   return true;
297 }
298 
CheckAVX2(benchmark::State & state)299 bool CheckAVX2(benchmark::State& state) {
300   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
301     state.SkipWithError("no AVX2 extension");
302     return false;
303   }
304   return true;
305 }
306 
CheckAVX512F(benchmark::State & state)307 bool CheckAVX512F(benchmark::State& state) {
308   if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
309     state.SkipWithError("no AVX512F extension");
310     return false;
311   }
312   return true;
313 }
314 
315 }  // namespace utils
316 }  // namespace benchmark
317