• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "ruy/pmu.h"
17 
18 #include "ruy/check_macros.h"
19 
20 #ifdef __linux__
21 #include <asm/unistd.h>
22 #include <linux/perf_event.h>
23 #include <sys/ioctl.h>
24 #include <syscall.h>
25 #include <unistd.h>
26 
27 #include <cstdio>
28 #endif
29 
30 #include <algorithm>
31 #include <cstdint>
32 #include <cstdlib>
33 #include <cstring>
34 
35 namespace ruy {
36 
37 // Linux-specific. Not ARM-specific.
38 #ifdef __linux__
39 class PerfEvent {
40  public:
PerfEvent(std::uint32_t type,std::uint64_t config)41   PerfEvent(std::uint32_t type, std::uint64_t config) {
42     perf_event_attr pe;
43     memset(&pe, 0, sizeof(pe));
44     pe.size = sizeof(pe);
45     pe.type = type;
46     pe.config = config;
47     pe.disabled = 1;
48     pe.exclude_kernel = 1;
49     pe.exclude_hv = 1;
50     pe.inherit = 1;
51     fd_ = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
52     if (fd_ == -1) {
53       fprintf(stderr, "perf_event_open failed for config 0x%lx\n",
54               static_cast<unsigned long>(config));
55       // abort();
56     }
57   }
58 
~PerfEvent()59   ~PerfEvent() {
60     RUY_CHECK(!started_);
61     close(fd_);
62   }
63 
Start()64   void Start() {
65     RUY_CHECK(!started_);
66     started_ = true;
67     ioctl(fd_, PERF_EVENT_IOC_RESET, 0);
68     ioctl(fd_, PERF_EVENT_IOC_ENABLE, 0);
69     count_at_start_ = Read();
70   }
71 
Stop()72   void Stop() {
73     RUY_CHECK(started_);
74     started_ = false;
75     ioctl(fd_, PERF_EVENT_IOC_DISABLE, 0);
76     count_at_stop_ = Read();
77   }
78 
Count() const79   std::int64_t Count() const {
80     RUY_CHECK(!started_);
81     return count_at_stop_ - count_at_start_;
82   }
83 
84  private:
Read() const85   std::int64_t Read() const {
86     std::int64_t count;
87     RUY_CHECK_NE(read(fd_, &count, sizeof(count)), -1);
88     return count;
89   }
90   std::int64_t count_at_start_ = -1;
91   std::int64_t count_at_stop_ = -1;
92   bool started_ = false;
93   int fd_ = -1;
94 };
95 #else
96 // Placeholder implementation to at least compile outside of linux.
97 #define PERF_TYPE_RAW 0
// Stand-in for platforms without Linux perf events. It exposes the same
// interface as the Linux implementation, does nothing, and always reports a
// count of zero.
class PerfEvent {
 public:
  // Both arguments are accepted for interface compatibility and ignored.
  PerfEvent(std::uint32_t /*type*/, std::uint64_t /*config*/) {}

  ~PerfEvent() {}

  // No-op.
  void Start() {}

  // No-op.
  void Stop() {}

  // Always zero: nothing is ever measured by this placeholder.
  std::int64_t Count() const {
    return 0;
  }
};
106 #endif
107 
108 // ARM-specific. Query ARM PMU counters as Linux perf events using
109 // PERF_TYPE_RAW.
namespace arm_pmuv3 {

// Most of these constants are unused; they are kept as a reference table, so
// silence the corresponding warning for the duration of the table.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"

// Event numbers as listed in the ARMv8 architecture reference manual.
constexpr std::uint16_t L1I_CACHE_REFILL{0x01};
constexpr std::uint16_t L1I_TLB_REFILL{0x02};
constexpr std::uint16_t L1D_CACHE_REFILL{0x03};
constexpr std::uint16_t L1D_CACHE{0x04};
constexpr std::uint16_t L1D_TLB_REFILL{0x05};
constexpr std::uint16_t LD_RETIRED{0x06};
constexpr std::uint16_t ST_RETIRED{0x07};
constexpr std::uint16_t INST_RETIRED{0x08};
constexpr std::uint16_t EXC_TAKEN{0x09};
constexpr std::uint16_t EXC_RETURN{0x0A};
constexpr std::uint16_t CID_WRITE_RETIRED{0x0B};
constexpr std::uint16_t PC_WRITE_RETIRED{0x0C};
constexpr std::uint16_t BR_IMMED_RETIRED{0x0D};
constexpr std::uint16_t BR_RETURN_RETIRED{0x0E};
constexpr std::uint16_t UNALIGNED_LDST_RETIRED{0x0F};
constexpr std::uint16_t BR_MIS_PRED{0x10};
constexpr std::uint16_t CPU_CYCLES{0x11};
constexpr std::uint16_t BR_PRED{0x12};
constexpr std::uint16_t MEM_ACCESS{0x13};
constexpr std::uint16_t L1I_CACHE{0x14};
constexpr std::uint16_t L1D_CACHE_WB{0x15};
constexpr std::uint16_t L2D_CACHE{0x16};
constexpr std::uint16_t L2D_CACHE_REFILL{0x17};
constexpr std::uint16_t L2D_CACHE_WB{0x18};
constexpr std::uint16_t BUS_ACCESS{0x19};
constexpr std::uint16_t MEMORY_ERROR{0x1A};
constexpr std::uint16_t INST_SPEC{0x1B};
constexpr std::uint16_t TTBR_WRITE_RETIRED{0x1C};
constexpr std::uint16_t BUS_CYCLES{0x1D};
constexpr std::uint16_t CHAIN{0x1E};
constexpr std::uint16_t L1D_CACHE_ALLOCATE{0x1F};
constexpr std::uint16_t L2D_CACHE_ALLOCATE{0x20};
constexpr std::uint16_t BR_RETIRED{0x21};
constexpr std::uint16_t BR_MIS_PRED_RETIRED{0x22};
constexpr std::uint16_t STALL_FRONTEND{0x23};
constexpr std::uint16_t STALL_BACKEND{0x24};
constexpr std::uint16_t L1D_TLB{0x25};
constexpr std::uint16_t L1I_TLB{0x26};
constexpr std::uint16_t L2I_CACHE{0x27};
constexpr std::uint16_t L2I_CACHE_REFILL{0x28};
constexpr std::uint16_t L3D_CACHE_ALLOCATE{0x29};
constexpr std::uint16_t L3D_CACHE_REFILL{0x2A};
constexpr std::uint16_t L3D_CACHE{0x2B};
constexpr std::uint16_t L3D_CACHE_WB{0x2C};
constexpr std::uint16_t L2D_TLB_REFILL{0x2D};
constexpr std::uint16_t L2I_TLB_REFILL{0x2E};
constexpr std::uint16_t L2D_TLB{0x2F};
constexpr std::uint16_t L2I_TLB{0x30};
constexpr std::uint16_t LL_CACHE{0x32};
constexpr std::uint16_t LL_CACHE_MISS{0x33};
constexpr std::uint16_t DTLB_WALK{0x34};
constexpr std::uint16_t LL_CACHE_RD{0x36};
constexpr std::uint16_t LL_CACHE_MISS_RD{0x37};

// Additional implementation-defined events, gathered from online sources
// rather than from the architecture reference manual.
constexpr std::uint16_t L1D_CACHE_RD{0x40};
constexpr std::uint16_t L1D_CACHE_REFILL_RD{0x42};
constexpr std::uint16_t L1D_TLB_REFILL_RD{0x4C};
constexpr std::uint16_t L1D_TLB_RD{0x4E};
constexpr std::uint16_t L2D_CACHE_RD{0x50};
constexpr std::uint16_t L2D_CACHE_REFILL_RD{0x52};
constexpr std::uint16_t BUS_ACCESS_RD{0x60};
constexpr std::uint16_t MEM_ACCESS_RD{0x66};
constexpr std::uint16_t L3D_CACHE_RD{0xA0};
constexpr std::uint16_t L3D_CACHE_REFILL_RD{0xA2};

#pragma GCC diagnostic pop

}  // namespace arm_pmuv3
185 
186 class PmuEventsPrivate {
187  public:
PmuEventsPrivate()188   PmuEventsPrivate()
189       : l1d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_REFILL),
190         l2d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_REFILL),
191         l3d_cache_refill(PERF_TYPE_RAW, arm_pmuv3::L3D_CACHE_REFILL),
192         ll_cache_miss(PERF_TYPE_RAW, arm_pmuv3::LL_CACHE_MISS),
193         l1d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L1D_TLB_REFILL),
194         l2d_tlb_refill(PERF_TYPE_RAW, arm_pmuv3::L2D_TLB_REFILL),
195         stall_frontend(PERF_TYPE_RAW, arm_pmuv3::STALL_FRONTEND),
196         stall_backend(PERF_TYPE_RAW, arm_pmuv3::STALL_BACKEND),
197         br_mis_pred(PERF_TYPE_RAW, arm_pmuv3::BR_MIS_PRED),
198         l1d_cache_writeback(PERF_TYPE_RAW, arm_pmuv3::L1D_CACHE_WB),
199         l2d_cache_writeback(PERF_TYPE_RAW, arm_pmuv3::L2D_CACHE_WB) {}
200 
201  private:
202   friend class PmuEvents;
203   PerfEvent l1d_cache_refill;
204   PerfEvent l2d_cache_refill;
205   PerfEvent l3d_cache_refill;
206   PerfEvent ll_cache_miss;
207   PerfEvent l1d_tlb_refill;
208   PerfEvent l2d_tlb_refill;
209   PerfEvent stall_frontend;
210   PerfEvent stall_backend;
211   PerfEvent br_mis_pred;
212   PerfEvent l1d_cache_writeback;
213   PerfEvent l2d_cache_writeback;
214 };
215 
PmuEvents()216 PmuEvents::PmuEvents() : priv(new PmuEventsPrivate) {}
~PmuEvents()217 PmuEvents::~PmuEvents() { delete priv; }
218 
StartRecording()219 void PmuEvents::StartRecording() {
220   priv->l1d_cache_refill.Start();
221   priv->l2d_cache_refill.Start();
222   priv->l3d_cache_refill.Start();
223   priv->ll_cache_miss.Start();
224   priv->l1d_tlb_refill.Start();
225   priv->l2d_tlb_refill.Start();
226   priv->stall_frontend.Start();
227   priv->stall_backend.Start();
228   priv->br_mis_pred.Start();
229   priv->l1d_cache_writeback.Start();
230   priv->l2d_cache_writeback.Start();
231 }
232 
StopRecording()233 void PmuEvents::StopRecording() {
234   priv->l1d_cache_refill.Stop();
235   priv->l2d_cache_refill.Stop();
236   priv->l3d_cache_refill.Stop();
237   priv->ll_cache_miss.Stop();
238   priv->l1d_tlb_refill.Stop();
239   priv->l2d_tlb_refill.Stop();
240   priv->stall_frontend.Stop();
241   priv->stall_backend.Stop();
242   priv->br_mis_pred.Stop();
243   priv->l1d_cache_writeback.Stop();
244   priv->l2d_cache_writeback.Stop();
245 }
246 
BranchMispredictionCount() const247 float PmuEvents::BranchMispredictionCount() const {
248   return static_cast<float>(priv->br_mis_pred.Count());
249 }
250 
FrontendStallCount() const251 float PmuEvents::FrontendStallCount() const {
252   return static_cast<float>(priv->stall_frontend.Count());
253 }
254 
BackendStallCount() const255 float PmuEvents::BackendStallCount() const {
256   return static_cast<float>(priv->stall_backend.Count());
257 }
258 
L1RefillCount() const259 float PmuEvents::L1RefillCount() const {
260   return static_cast<float>(priv->l1d_cache_refill.Count());
261 }
262 
L2RefillCount() const263 float PmuEvents::L2RefillCount() const {
264   return static_cast<float>(priv->l2d_cache_refill.Count());
265 }
266 
L3RefillCount() const267 float PmuEvents::L3RefillCount() const {
268   // Important: this was discovered in the context of the above experiments,
269   // which also tested the _RD variants of these counters. So it's possible that
270   // it's just not needed here with the default (non _RD) counters.
271   //
272   // Some CPUs implement LL_CACHE_MISS[_RD], some implement
273   // L3D_CACHE_REFILL[_RD]. It seems that either one of these two counters is
274   // zero, or they roughly both agree with each other. Therefore, taking the max
275   // of them is a reasonable way to get something more portable across various
276   // CPUs.
277   return static_cast<float>(
278       std::max(priv->l3d_cache_refill.Count(), priv->ll_cache_miss.Count()));
279 }
280 
L1TLBRefillCount() const281 float PmuEvents::L1TLBRefillCount() const {
282   return static_cast<float>(priv->l1d_tlb_refill.Count());
283 }
284 
L2TLBRefillCount() const285 float PmuEvents::L2TLBRefillCount() const {
286   return static_cast<float>(priv->l2d_tlb_refill.Count());
287 }
288 
L1WritebackCount() const289 float PmuEvents::L1WritebackCount() const {
290   return static_cast<float>(priv->l1d_cache_writeback.Count());
291 }
292 
L2WritebackCount() const293 float PmuEvents::L2WritebackCount() const {
294   return static_cast<float>(priv->l2d_cache_writeback.Count());
295 }
296 
297 }  // namespace ruy
298