1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <cstdio>
7 #include <cstdlib>
8 #include <cstring>
9 #include <mutex>
10
11 #ifdef __linux__
12 #include <sched.h>
13 #endif
14 #if defined(__ANDROID__) || defined(_WIN32) || defined(__CYGWIN__)
15 #include <malloc.h>
16 #endif
17 #if defined(__SSE__) || defined(__x86_64__)
18 #include <xmmintrin.h>
19 #endif
20
21 #include <cpuinfo.h>
22
23 #include "bench/utils.h"
24
25
// Buffer that WipeCache() reads through; allocated and filled by InitWipeBuffer().
static void* wipe_buffer = nullptr;
// Size of wipe_buffer in bytes; assigned by InitWipeBuffer().
static size_t wipe_buffer_size = 0;

// Passed to std::call_once in WipeCache() so that InitWipeBuffer() runs only once.
static std::once_flag wipe_buffer_guard;
30
InitWipeBuffer()31 static void InitWipeBuffer() {
32 // Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
33 wipe_buffer_size = 128 * 1024 * 1024;
34 if (cpuinfo_initialize()) {
35 wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
36 }
37 #if defined(_WIN32)
38 wipe_buffer = _aligned_malloc(wipe_buffer_size, 128);
39 #elif defined(__ANDROID__) || defined(__CYGWIN__)
40 // memalign is obsolete, but it is the only option on Android until API level 17.
41 wipe_buffer = memalign(128, wipe_buffer_size);
42 #else
43 (void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
44 #endif
45 if (wipe_buffer != nullptr) {
46 memset(wipe_buffer, 0xA5, wipe_buffer_size);
47 }
48 }
49
50 namespace benchmark {
51 namespace utils {
52
PrefetchToL1(const void * ptr,size_t size)53 uint32_t PrefetchToL1(const void* ptr, size_t size) {
54 uint32_t step = 16;
55 if (cpuinfo_initialize()) {
56 step = cpuinfo_get_l1d_cache(0)->line_size;
57 }
58 const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
59 // Compute and return sum of data to prevent compiler from removing data reads.
60 uint32_t sum = 0;
61 while (size >= step) {
62 sum += uint32_t(*u8_ptr);
63 u8_ptr += step;
64 size -= step;
65 }
66 return sum;
67 }
68
WipeCache()69 uint32_t WipeCache() {
70 std::call_once(wipe_buffer_guard, InitWipeBuffer);
71 return PrefetchToL1(wipe_buffer, wipe_buffer_size);
72 }
73
// Sets the floating-point control bits that flush denormal numbers to zero on
// the calling thread:
//  - x86 / SSE:  ORs 0x8040 into MXCSR (flush-to-zero, bit 15, and
//                denormals-are-zero, bit 6).
//  - 32-bit ARM: ORs 0x1000000 into FPSCR (bit 24).
//  - AArch64:    ORs 0x1000000 (bit 24) and 0x80000 (bit 19) into FPCR.
// On any other target the function does nothing.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  _mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  uint32_t fpscr;
  #if defined(__thumb__) && !defined(__thumb2__)
    // The output register is written by VMRS before the bitmask input is read
    // by ORRS, so it must be marked early-clobber ("&"); otherwise the compiler
    // is free to place both operands in the same register.
    __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORRS %[fpscr], %[bitmask]\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=&l" (fpscr)
      : [bitmask] "l" (0x1000000)
      : "cc");
  #else
    __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=r" (fpscr));
  #endif
#elif defined(__aarch64__)
  uint64_t fpcr;
  __asm__ __volatile__(
    "MRS %[fpcr], fpcr\n"
    "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
    "ORR %w[fpcr], %w[fpcr], 0x80000\n"
    "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (fpcr));
#endif
}
104
// Returns the current clock rate, in Hz, of the CPU the calling thread is
// running on, or 0 if it cannot be determined.
//
// On Linux the value is read from
// /sys/devices/system/cpu/cpu<N>/cpufreq/scaling_cur_freq and multiplied by
// 1000. On every other platform the function always returns 0.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu >= 0) {
    char cpuinfo_name[512];
    snprintf(cpuinfo_name, sizeof(cpuinfo_name),
      "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

    FILE* f = fopen(cpuinfo_name, "r");
    if (f != nullptr) {
      int freq = 0;
      // fscanf returns the number of converted fields, or EOF (a negative,
      // hence "truthy", value) on read failure: compare against 1 explicitly.
      const int fields_read = fscanf(f, "%d", &freq);
      fclose(f);
      // Reject non-positive readings so they cannot wrap around when widened
      // to an unsigned 64-bit value.
      if (fields_read == 1 && freq > 0) {
        return uint64_t(freq) * 1000;
      }
    }
  }
#endif  // __linux__
  return 0;
}
125
GetMaxCacheSize()126 size_t GetMaxCacheSize() {
127 if (!cpuinfo_initialize()) {
128 #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
129 // DynamIQ max: 4 MB
130 return 4 * 1024 * 1024;
131 #else
132 // Intel eDRAM max: 128 MB
133 return 128 * 1024 * 1024;
134 #endif
135 }
136 return cpuinfo_get_max_cache_size();
137 }
138
MultiThreadingParameters(benchmark::internal::Benchmark * benchmark)139 void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
140 benchmark->ArgName("T");
141
142 // Disabled thread pool (execution on the caller thread only).
143 benchmark->Arg(1);
144
145 if (cpuinfo_initialize()) {
146 // All cores except the little ones.
147 uint32_t max_cores = cpuinfo_get_cores_count();
148 if (cpuinfo_get_clusters_count() > 1) {
149 max_cores -= cpuinfo_get_cluster(cpuinfo_get_clusters_count() - 1)->core_count;
150 }
151 for (uint32_t t = 2; t <= max_cores; t++) {
152 benchmark->Arg(t);
153 }
154
155 // All cores (if more than one cluster).
156 if (cpuinfo_get_cores_count() > max_cores) {
157 benchmark->Arg(cpuinfo_get_cores_count());
158 }
159
160 // All cores + hyperthreads (only if hyperthreading supported).
161 if (cpuinfo_get_processors_count() > cpuinfo_get_cores_count()) {
162 benchmark->Arg(cpuinfo_get_processors_count());
163 }
164 }
165 }
166
167
CheckVFP(benchmark::State & state)168 bool CheckVFP(benchmark::State& state) {
169 if (!cpuinfo_initialize() || !(cpuinfo_has_arm_vfpv2() || cpuinfo_has_arm_vfpv3())) {
170 state.SkipWithError("no VFP extension");
171 return false;
172 }
173 return true;
174 }
175
CheckNEONFP16ARITH(benchmark::State & state)176 bool CheckNEONFP16ARITH(benchmark::State& state) {
177 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fp16_arith()) {
178 state.SkipWithError("no NEON-FP16-ARITH extension");
179 return false;
180 }
181 return true;
182 }
183
CheckNEON(benchmark::State & state)184 bool CheckNEON(benchmark::State& state) {
185 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
186 state.SkipWithError("no NEON extension");
187 return false;
188 }
189 return true;
190 }
191
CheckNEONFMA(benchmark::State & state)192 bool CheckNEONFMA(benchmark::State& state) {
193 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_fma()) {
194 state.SkipWithError("no NEON-FMA extension");
195 return false;
196 }
197 return true;
198 }
199
CheckNEONDOT(benchmark::State & state)200 bool CheckNEONDOT(benchmark::State& state) {
201 if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon_dot()) {
202 state.SkipWithError("no NEON-DOT extension");
203 return false;
204 }
205 return true;
206 }
207
CheckSSSE3(benchmark::State & state)208 bool CheckSSSE3(benchmark::State& state) {
209 if (!cpuinfo_initialize() || !cpuinfo_has_x86_ssse3()) {
210 state.SkipWithError("no SSSE3 extension");
211 return false;
212 }
213 return true;
214 }
215
CheckSSE41(benchmark::State & state)216 bool CheckSSE41(benchmark::State& state) {
217 if (!cpuinfo_initialize() || !cpuinfo_has_x86_sse4_1()) {
218 state.SkipWithError("no SSE4.1 extension");
219 return false;
220 }
221 return true;
222 }
223
CheckAVX(benchmark::State & state)224 bool CheckAVX(benchmark::State& state) {
225 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx()) {
226 state.SkipWithError("no AVX extension");
227 return false;
228 }
229 return true;
230 }
231
CheckXOP(benchmark::State & state)232 bool CheckXOP(benchmark::State& state) {
233 if (!cpuinfo_initialize() || !cpuinfo_has_x86_xop()) {
234 state.SkipWithError("no XOP extension");
235 return false;
236 }
237 return true;
238 }
239
CheckFMA3(benchmark::State & state)240 bool CheckFMA3(benchmark::State& state) {
241 if (!cpuinfo_initialize() || !cpuinfo_has_x86_fma3()) {
242 state.SkipWithError("no FMA3 extension");
243 return false;
244 }
245 return true;
246 }
247
CheckAVX2(benchmark::State & state)248 bool CheckAVX2(benchmark::State& state) {
249 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx2()) {
250 state.SkipWithError("no AVX2 extension");
251 return false;
252 }
253 return true;
254 }
255
CheckAVX512F(benchmark::State & state)256 bool CheckAVX512F(benchmark::State& state) {
257 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f()) {
258 state.SkipWithError("no AVX512F extension");
259 return false;
260 }
261 return true;
262 }
263
CheckAVX512SKX(benchmark::State & state)264 bool CheckAVX512SKX(benchmark::State& state) {
265 if (!cpuinfo_initialize() || !cpuinfo_has_x86_avx512f() ||
266 !cpuinfo_has_x86_avx512cd() || !cpuinfo_has_x86_avx512bw() ||
267 !cpuinfo_has_x86_avx512dq() || !cpuinfo_has_x86_avx512vl())
268 {
269 state.SkipWithError("no AVX512 SKX extensions");
270 return false;
271 }
272 return true;
273 }
274
275 } // namespace utils
276 } // namespace benchmark
277