1 #include <random>
2 #include <thread>
3
4 #include "../src/perf_counters.h"
5 #include "gtest/gtest.h"
6
7 #ifndef GTEST_SKIP
// Fallback sink used when the available gtest predates GTEST_SKIP:
// it absorbs the stream the skip macro below assigns into it.
struct MsgHandler {
  // Deliberately a no-op; the streamed skip message is discarded.
  void operator=(std::ostream& ignored) { (void)ignored; }
};
11 #define GTEST_SKIP() return MsgHandler() = std::cout
12 #endif
13
14 using benchmark::internal::PerfCounters;
15 using benchmark::internal::PerfCountersMeasurement;
16 using benchmark::internal::PerfCounterValues;
17
18 namespace {
// Generic (architecture-independent) event names; these are expected to
// be resolvable on every platform where perf counters are supported.
const char kGenericPerfEvent1[] = "CYCLES";
const char kGenericPerfEvent2[] = "BRANCHES";
const char kGenericPerfEvent3[] = "INSTRUCTIONS";
22
// Initialize() must agree with the compile-time kSupported flag:
// both succeed when counter support is built in, both fail otherwise.
TEST(PerfCountersTest, Init) {
  EXPECT_EQ(PerfCounters::Initialize(), PerfCounters::kSupported);
}
26
// Creating a PerfCounters object from a single valid generic event
// should yield exactly one live counter.
TEST(PerfCountersTest, OneCounter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Performance counters not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  EXPECT_EQ(PerfCounters::Create({kGenericPerfEvent1}).num_counters(), 1);
}
34
// Exercises Create() with empty, invalid and mixed inputs. Create()
// never fails outright: unrecognized counter names are warned about
// and dropped, so only the set of accepted counters varies.
TEST(PerfCountersTest, NegativeTest) {
  if (!PerfCounters::kSupported) {
    EXPECT_FALSE(PerfCounters::Initialize());
    return;
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  // No events, an empty name, and a bogus name each yield zero counters.
  EXPECT_EQ(PerfCounters::Create({}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({""}).num_counters(), 0);
  EXPECT_EQ(PerfCounters::Create({"not a counter name"}).num_counters(), 0);
  {
    // An empty name sandwiched between two valid events must be
    // filtered out, leaving exactly the two valid ones, in order.
    auto created =
        PerfCounters::Create({kGenericPerfEvent2, "", kGenericPerfEvent1});
    EXPECT_EQ(created.num_counters(), 2);
    EXPECT_EQ(created.names(), std::vector<std::string>(
                                   {kGenericPerfEvent2, kGenericPerfEvent1}));
  }
  {
    // A fat-finger-style nonsense name in the middle is likewise dropped.
    auto created = PerfCounters::Create(
        {kGenericPerfEvent3, "not a counter name", kGenericPerfEvent1});
    EXPECT_EQ(created.num_counters(), 2);
    EXPECT_EQ(created.names(), std::vector<std::string>(
                                   {kGenericPerfEvent3, kGenericPerfEvent1}));
  }
  {
    // All-valid input: every event should be accepted.
    EXPECT_EQ(PerfCounters::Create(
                  {kGenericPerfEvent1, kGenericPerfEvent2, kGenericPerfEvent3})
                  .num_counters(),
              3);
  }
  {
    // A bad name at the tail of the list checks the edge handling.
    auto created = PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2,
                                         kGenericPerfEvent3,
                                         "MISPREDICTED_BRANCH_RETIRED"});
    EXPECT_EQ(created.num_counters(), 3);
    EXPECT_EQ(created.names(),
              std::vector<std::string>({kGenericPerfEvent1, kGenericPerfEvent2,
                                        kGenericPerfEvent3}));
  }
}
83
// Two consecutive snapshots of a single cycles counter: both readings
// must be positive, and the later one strictly larger, since cycle
// counts accumulate between the two Snapshot() calls.
TEST(PerfCountersTest, Read1Counter) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters = PerfCounters::Create({kGenericPerfEvent1});
  EXPECT_EQ(counters.num_counters(), 1);
  PerfCounterValues first_snapshot(1);
  EXPECT_TRUE(counters.Snapshot(&first_snapshot));
  EXPECT_GT(first_snapshot[0], 0);
  PerfCounterValues second_snapshot(1);
  EXPECT_TRUE(counters.Snapshot(&second_snapshot));
  EXPECT_GT(second_snapshot[0], 0);
  EXPECT_GT(second_snapshot[0], first_snapshot[0]);
}
99
// A two-event set (cycles + branches): both slots of each snapshot
// must come back positive.
TEST(PerfCountersTest, Read2Counters) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent2});
  EXPECT_EQ(counters.num_counters(), 2);
  PerfCounterValues snapshot_a(2);
  EXPECT_TRUE(counters.Snapshot(&snapshot_a));
  EXPECT_GT(snapshot_a[0], 0);
  EXPECT_GT(snapshot_a[1], 0);
  PerfCounterValues snapshot_b(2);
  EXPECT_TRUE(counters.Snapshot(&snapshot_b));
  EXPECT_GT(snapshot_b[0], 0);
  EXPECT_GT(snapshot_b[1], 0);
}
117
// Opening the same event several times over must still allow each
// instance to take a snapshot. This works on recent and old Intel
// hardware, but we cannot assume more than 3 HW counters.
TEST(PerfCountersTest, ReopenExistingCounters) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  std::vector<std::string> event_names({kGenericPerfEvent1});
  std::vector<PerfCounters> instances(3);
  for (size_t i = 0; i < instances.size(); ++i) {
    instances[i] = PerfCounters::Create(event_names);
  }
  PerfCounterValues scratch(1);
  EXPECT_TRUE(instances[0].Snapshot(&scratch));
  EXPECT_TRUE(instances[1].Snapshot(&scratch));
  EXPECT_TRUE(instances[2].Snapshot(&scratch));
}
135
// Opens many simultaneous measurements of the same event and checks
// the graceful-degradation path: reads past the hardware counter limit
// (assumed to be a small number, ~3, at this date - the same assumption
// as in ReopenExistingCounters) are allowed to fail, while the first
// kMinValidCounters must succeed.
TEST(PerfCountersTest, CreateExistingMeasurements) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // We will try 10 counters, but can only guarantee for sure at this
  // time that the first 3 work. Perhaps in the future we could use
  // libpfm to query the hardware limits on this particular platform.
  const int kMaxCounters = 10;
  const int kMinValidCounters = 3;

  // Use a ubiquitous counter that is guaranteed to work on all
  // platforms.
  const std::vector<std::string> kMetrics{"cycles"};

  // Cannot create a vector of actual objects because the copy
  // constructor of PerfCounters is deleted - and so is implicitly
  // deleted on PerfCountersMeasurement too.
  std::vector<std::unique_ptr<PerfCountersMeasurement>>
      perf_counter_measurements;

  perf_counter_measurements.reserve(kMaxCounters);
  for (int j = 0; j < kMaxCounters; ++j) {
    perf_counter_measurements.emplace_back(
        new PerfCountersMeasurement(kMetrics));
  }

  std::vector<std::pair<std::string, double>> measurements;

  // Start all counters together to see how many the hardware holds;
  // record the first index that fails to start.
  int max_counters = kMaxCounters;
  for (int i = 0; i < kMaxCounters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    EXPECT_EQ(counter.num_counters(), 1);
    if (!counter.Start()) {
      max_counters = i;
      break;
    }
  }

  ASSERT_GE(max_counters, kMinValidCounters);

  // Stop all counters together; only the first kMinValidCounters are
  // guaranteed to produce a valid read.
  for (int i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }

  // Start/stop each measurement individually.
  for (int i = 0; i < max_counters; ++i) {
    auto& counter(*perf_counter_measurements[i]);
    measurements.clear();
    counter.Start();
    EXPECT_TRUE(counter.Stop(measurements) || (i >= kMinValidCounters));
  }
}
198
199 // We try to do some meaningful work here but the compiler
200 // insists in optimizing away our loop so we had to add a
201 // no-optimize macro. In case it fails, we added some entropy
202 // to this pool as well.
203
do_work()204 BENCHMARK_DONT_OPTIMIZE size_t do_work() {
205 static std::mt19937 rd{std::random_device{}()};
206 static std::uniform_int_distribution<size_t> mrand(0, 10);
207 const size_t kNumLoops = 1000000;
208 size_t sum = 0;
209 for (size_t j = 0; j < kNumLoops; ++j) {
210 sum += mrand(rd);
211 }
212 benchmark::DoNotOptimize(sum);
213 return sum;
214 }
215
// Runs `threadcount` worker threads, each doing do_work(), under a
// shared cycles+instructions counter pair, snapshotting the counters
// just after thread launch (into `before`) and just after all threads
// joined (into `after`). Both output pointers must be non-null and
// sized for two counters.
void measure(size_t threadcount, PerfCounterValues* before,
             PerfCounterValues* after) {
  BM_CHECK_NE(before, nullptr);
  BM_CHECK_NE(after, nullptr);
  std::vector<std::thread> threads(threadcount);
  auto work = [&]() { BM_CHECK(do_work() > 1000); };

  // We need to first set up the counters, then start the threads, so the
  // threads would inherit the counters. But later, we need to first destroy
  // the thread pool (so all the work finishes), then measure the counters. So
  // the scopes overlap, and we need to explicitly control the scope of the
  // threadpool.
  auto counters =
      PerfCounters::Create({kGenericPerfEvent1, kGenericPerfEvent3});
  for (auto& t : threads) t = std::thread(work);
  counters.Snapshot(before);
  for (auto& t : threads) t.join();
  counters.Snapshot(after);
}
235
// Doubling the worker-thread count should roughly double the combined
// cycle and instruction deltas observed across the measure() window.
TEST(PerfCountersTest, MultiThreaded) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.";
  }
  EXPECT_TRUE(PerfCounters::Initialize());
  PerfCounterValues snapshot_begin(2);
  PerfCounterValues snapshot_end(2);

  // This works even when taskset to a single CPU: the threads then
  // simply run sequentially. First measure two worker threads...
  measure(2, &snapshot_begin, &snapshot_end);
  std::vector<double> delta_two_threads{
      static_cast<double>(snapshot_end[0] - snapshot_begin[0]),
      static_cast<double>(snapshot_end[1] - snapshot_begin[1])};

  // ...then four worker threads.
  measure(4, &snapshot_begin, &snapshot_end);
  std::vector<double> delta_four_threads{
      static_cast<double>(snapshot_end[0] - snapshot_begin[0]),
      static_cast<double>(snapshot_end[1] - snapshot_begin[1])};

  // The main thread does some extra work (e.g. joining the threads),
  // so the ratio will not be exactly 2.0, but should be very close.
  EXPECT_GE(delta_four_threads[0], 1.9 * delta_two_threads[0]);
  EXPECT_GE(delta_four_threads[1], 1.9 * delta_two_threads[1]);
}
265
// Requests far more hardware events than the PMU can host at once
// (typically 3-4 - the same hardware assumption as the
// ReopenExistingCounters test) and checks that a grouped measurement
// still starts and stops cleanly.
TEST(PerfCountersTest, HardwareLimits) {
  if (!PerfCounters::kSupported) {
    GTEST_SKIP() << "Test skipped because libpfm is not supported.\n";
  }
  EXPECT_TRUE(PerfCounters::Initialize());

  // Taken straight from `perf list` on x86-64; all hardware event
  // names, since those are the problematic ones.
  std::vector<std::string> hardware_events{"cycles",  // leader
                                           "instructions",
                                           "branches",
                                           "L1-dcache-loads",
                                           "L1-dcache-load-misses",
                                           "L1-dcache-prefetches",
                                           "L1-icache-load-misses",  // leader
                                           "L1-icache-loads",
                                           "branch-load-misses",
                                           "branch-loads",
                                           "dTLB-load-misses",
                                           "dTLB-loads",
                                           "iTLB-load-misses",  // leader
                                           "iTLB-loads",
                                           "branch-instructions",
                                           "branch-misses",
                                           "cache-misses",
                                           "cache-references",
                                           "stalled-cycles-backend",  // leader
                                           "stalled-cycles-frontend"};

  // Some of these may be unsupported on a given platform; filter those
  // out so the test completes without failure - even if it then does
  // not actually exercise the grouping there.
  std::vector<std::string> supported_events;
  for (const std::string& event : hardware_events) {
    if (PerfCounters::IsCounterSupported(event)) {
      supported_events.push_back(event);
    }
  }
  PerfCountersMeasurement measurement(supported_events);

  std::vector<std::pair<std::string, double>> results;

  measurement.Start();
  EXPECT_TRUE(measurement.Stop(results));
}
315
316 } // namespace
317