1 /* Copyright 2019 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "ruy/pmu.h"
17
18 #include "ruy/check_macros.h"
19
20 #ifdef __linux__
21 #include <asm/unistd.h>
22 #include <linux/perf_event.h>
23 #include <sys/ioctl.h>
24 #include <syscall.h>
25 #include <unistd.h>
26
27 #include <cstdio>
28 #endif
29
30 #include <algorithm>
31 #include <cstdint>
32 #include <cstdlib>
33 #include <cstring>
34
35 namespace ruy {
36
37 // Linux-specific. Not ARM-specific.
38 #ifdef __linux__
39 class PerfEvent {
40 public:
PerfEvent(std::uint32_t type,std::uint64_t config)41 PerfEvent(std::uint32_t type, std::uint64_t config) {
42 perf_event_attr pe;
43 memset(&pe, 0, sizeof(pe));
44 pe.size = sizeof(pe);
45 pe.type = type;
46 pe.config = config;
47 pe.disabled = 1;
48 pe.exclude_kernel = 1;
49 pe.exclude_hv = 1;
50 pe.inherit = 1;
51 fd_ = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
52 if (fd_ == -1) {
53 fprintf(stderr, "perf_event_open failed for config 0x%lx\n",
54 static_cast<unsigned long>(config));
55 // abort();
56 }
57 }
58
~PerfEvent()59 ~PerfEvent() {
60 RUY_CHECK(!started_);
61 close(fd_);
62 }
63
Start()64 void Start() {
65 RUY_CHECK(!started_);
66 started_ = true;
67 ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
68 ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
69 count_at_start_ = Read();
70 }
71
Stop()72 void Stop() {
73 RUY_CHECK(started_);
74 started_ = false;
75 ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
76 count_at_stop_ = Read();
77 }
78
Count() const79 std::int64_t Count() const {
80 RUY_CHECK(!started_);
81 return count_at_stop_ - count_at_start_;
82 }
83
84 private:
Read() const85 std::int64_t Read() const {
86 std::int64_t count;
87 RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1);
88 return count;
89 }
90 std::int64_t count_at_start_ = -1;
91 std::int64_t count_at_stop_ = -1;
92 bool started_ = false;
93 int fd_ = -1;
94 };
95 #else
96 // Placeholder implementation to at least compile outside of linux.
97 #define PERF_TYPE_RAW 0
// Stand-in for PerfEvent used when not building for Linux: every operation
// is a no-op and the reported count is always zero.
class PerfEvent {
 public:
  // Both arguments are accepted for interface compatibility and ignored.
  PerfEvent(std::uint32_t /*type*/, std::uint64_t /*config*/) {}

  ~PerfEvent() {}

  // No counter exists, so there is nothing to start.
  void Start() {}

  // No counter exists, so there is nothing to stop.
  void Stop() {}

  // Always reports zero events.
  std::int64_t Count() const {
    const std::int64_t no_events = 0;
    return no_events;
  }
};
106 #endif
107
108 // ARM-specific. Query ARM PMU counters as Linux perf events using
109 // PERF_TYPE_RAW.
namespace arm_pmuv3 {

// Not every constant below is referenced in this file; silence the
// corresponding warning for the duration of the list.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"

// Event numbers as listed in the ARMv8 architecture reference manual.
constexpr std::uint16_t L1I_CACHE_REFILL{0x01};
constexpr std::uint16_t L1I_TLB_REFILL{0x02};
constexpr std::uint16_t L1D_CACHE_REFILL{0x03};
constexpr std::uint16_t L1D_CACHE{0x04};
constexpr std::uint16_t L1D_TLB_REFILL{0x05};
constexpr std::uint16_t LD_RETIRED{0x06};
constexpr std::uint16_t ST_RETIRED{0x07};
constexpr std::uint16_t INST_RETIRED{0x08};
constexpr std::uint16_t EXC_TAKEN{0x09};
constexpr std::uint16_t EXC_RETURN{0x0A};
constexpr std::uint16_t CID_WRITE_RETIRED{0x0B};
constexpr std::uint16_t PC_WRITE_RETIRED{0x0C};
constexpr std::uint16_t BR_IMMED_RETIRED{0x0D};
constexpr std::uint16_t BR_RETURN_RETIRED{0x0E};
constexpr std::uint16_t UNALIGNED_LDST_RETIRED{0x0F};
constexpr std::uint16_t BR_MIS_PRED{0x10};
constexpr std::uint16_t CPU_CYCLES{0x11};
constexpr std::uint16_t BR_PRED{0x12};
constexpr std::uint16_t MEM_ACCESS{0x13};
constexpr std::uint16_t L1I_CACHE{0x14};
constexpr std::uint16_t L1D_CACHE_WB{0x15};
constexpr std::uint16_t L2D_CACHE{0x16};
constexpr std::uint16_t L2D_CACHE_REFILL{0x17};
constexpr std::uint16_t L2D_CACHE_WB{0x18};
constexpr std::uint16_t BUS_ACCESS{0x19};
constexpr std::uint16_t MEMORY_ERROR{0x1A};
constexpr std::uint16_t INST_SPEC{0x1B};
constexpr std::uint16_t TTBR_WRITE_RETIRED{0x1C};
constexpr std::uint16_t BUS_CYCLES{0x1D};
constexpr std::uint16_t CHAIN{0x1E};
constexpr std::uint16_t L1D_CACHE_ALLOCATE{0x1F};
constexpr std::uint16_t L2D_CACHE_ALLOCATE{0x20};
constexpr std::uint16_t BR_RETIRED{0x21};
constexpr std::uint16_t BR_MIS_PRED_RETIRED{0x22};
constexpr std::uint16_t STALL_FRONTEND{0x23};
constexpr std::uint16_t STALL_BACKEND{0x24};
constexpr std::uint16_t L1D_TLB{0x25};
constexpr std::uint16_t L1I_TLB{0x26};
constexpr std::uint16_t L2I_CACHE{0x27};
constexpr std::uint16_t L2I_CACHE_REFILL{0x28};
constexpr std::uint16_t L3D_CACHE_ALLOCATE{0x29};
constexpr std::uint16_t L3D_CACHE_REFILL{0x2A};
constexpr std::uint16_t L3D_CACHE{0x2B};
constexpr std::uint16_t L3D_CACHE_WB{0x2C};
constexpr std::uint16_t L2D_TLB_REFILL{0x2D};
constexpr std::uint16_t L2I_TLB_REFILL{0x2E};
constexpr std::uint16_t L2D_TLB{0x2F};
constexpr std::uint16_t L2I_TLB{0x30};
constexpr std::uint16_t LL_CACHE{0x32};
constexpr std::uint16_t LL_CACHE_MISS{0x33};
constexpr std::uint16_t DTLB_WALK{0x34};
constexpr std::uint16_t LL_CACHE_RD{0x36};
constexpr std::uint16_t LL_CACHE_MISS_RD{0x37};

// Additional implementation-defined events, collected from online sources
// rather than from the architecture reference manual.
constexpr std::uint16_t L1D_CACHE_RD{0x40};
constexpr std::uint16_t L1D_CACHE_REFILL_RD{0x42};
constexpr std::uint16_t L1D_TLB_REFILL_RD{0x4C};
constexpr std::uint16_t L1D_TLB_RD{0x4E};
constexpr std::uint16_t L2D_CACHE_RD{0x50};
constexpr std::uint16_t L2D_CACHE_REFILL_RD{0x52};
constexpr std::uint16_t BUS_ACCESS_RD{0x60};
constexpr std::uint16_t MEM_ACCESS_RD{0x66};
constexpr std::uint16_t L3D_CACHE_RD{0xA0};
constexpr std::uint16_t L3D_CACHE_REFILL_RD{0xA2};

#pragma GCC diagnostic pop

}  // namespace arm_pmuv3
185
186 class PmuEventsPrivate {
187 public:
PmuEventsPrivate()188 PmuEventsPrivate()
189 : l1d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL),
190 l2d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL),
191 l3d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL),
192 ll_cache_miss(PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS),
193 l1d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL),
194 l2d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL),
195 stall_frontend(PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND),
196 stall_backend(PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND),
197 br_mis_pred(PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED),
198 l1d_cache_writeback(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_WB),
199 l2d_cache_writeback(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_WB) {}
200
201 private:
202 friend class PmuEvents;
203 PerfEvent l1d_cache_refill;
204 PerfEvent l2d_cache_refill;
205 PerfEvent l3d_cache_refill;
206 PerfEvent ll_cache_miss;
207 PerfEvent l1d_tlb_refill;
208 PerfEvent l2d_tlb_refill;
209 PerfEvent stall_frontend;
210 PerfEvent stall_backend;
211 PerfEvent br_mis_pred;
212 PerfEvent l1d_cache_writeback;
213 PerfEvent l2d_cache_writeback;
214 };
215
PmuEvents()216 PmuEvents::PmuEvents() : priv(new PmuEventsPrivate) {}
~PmuEvents()217 PmuEvents::~PmuEvents() { delete priv; }
218
StartRecording()219 void PmuEvents::StartRecording() {
220 priv->l1d_cache_refill.Start();
221 priv->l2d_cache_refill.Start();
222 priv->l3d_cache_refill.Start();
223 priv->ll_cache_miss.Start();
224 priv->l1d_tlb_refill.Start();
225 priv->l2d_tlb_refill.Start();
226 priv->stall_frontend.Start();
227 priv->stall_backend.Start();
228 priv->br_mis_pred.Start();
229 priv->l1d_cache_writeback.Start();
230 priv->l2d_cache_writeback.Start();
231 }
232
StopRecording()233 void PmuEvents::StopRecording() {
234 priv->l1d_cache_refill.Stop();
235 priv->l2d_cache_refill.Stop();
236 priv->l3d_cache_refill.Stop();
237 priv->ll_cache_miss.Stop();
238 priv->l1d_tlb_refill.Stop();
239 priv->l2d_tlb_refill.Stop();
240 priv->stall_frontend.Stop();
241 priv->stall_backend.Stop();
242 priv->br_mis_pred.Stop();
243 priv->l1d_cache_writeback.Stop();
244 priv->l2d_cache_writeback.Stop();
245 }
246
BranchMispredictionCount() const247 float PmuEvents::BranchMispredictionCount() const {
248 return static_cast<float>(priv->br_mis_pred.Count());
249 }
250
FrontendStallCount() const251 float PmuEvents::FrontendStallCount() const {
252 return static_cast<float>(priv->stall_frontend.Count());
253 }
254
BackendStallCount() const255 float PmuEvents::BackendStallCount() const {
256 return static_cast<float>(priv->stall_backend.Count());
257 }
258
L1RefillCount() const259 float PmuEvents::L1RefillCount() const {
260 return static_cast<float>(priv->l1d_cache_refill.Count());
261 }
262
L2RefillCount() const263 float PmuEvents::L2RefillCount() const {
264 return static_cast<float>(priv->l2d_cache_refill.Count());
265 }
266
L3RefillCount() const267 float PmuEvents::L3RefillCount() const {
268 // Important: this was discovered in the context of the above experiments,
269 // which also tested the _RD variants of these counters. So it's possible that
270 // it's just not needed here with the default (non _RD) counters.
271 //
272 // Some CPUs implement LL_CACHE_MISS[_RD], some implement
273 // L3D_CACHE_REFILL[_RD]. It seems that either one of these two counters is
274 // zero, or they roughly both agree with each other. Therefore, taking the max
275 // of them is a reasonable way to get something more portable across various
276 // CPUs.
277 return static_cast<float>(
278 std::max(priv->l3d_cache_refill.Count(), priv->ll_cache_miss.Count()));
279 }
280
L1TLBRefillCount() const281 float PmuEvents::L1TLBRefillCount() const {
282 return static_cast<float>(priv->l1d_tlb_refill.Count());
283 }
284
L2TLBRefillCount() const285 float PmuEvents::L2TLBRefillCount() const {
286 return static_cast<float>(priv->l2d_tlb_refill.Count());
287 }
288
L1WritebackCount() const289 float PmuEvents::L1WritebackCount() const {
290 return static_cast<float>(priv->l1d_cache_writeback.Count());
291 }
292
L2WritebackCount() const293 float PmuEvents::L2WritebackCount() const {
294 return static_cast<float>(priv->l2d_cache_writeback.Count());
295 }
296
297 } // namespace ruy
298