// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/android/orderfile/orderfile_instrumentation.h"

#include <time.h>
#include <unistd.h>

#include <atomic>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

#include "base/android/library_loader/anchor_functions.h"
#include "base/android/orderfile/orderfile_buildflags.h"
#include "base/compiler_specific.h"
#include "base/files/file.h"
#include "base/format_macros.h"
#include "base/immediate_crash.h"
#include "base/json/json_writer.h"
#include "base/logging.h"
#include "base/strings/stringprintf.h"
#include "base/values.h"
#include "build/build_config.h"

#if BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)
#include <sstream>

#include "base/command_line.h"
#include "base/time/time.h"
#include "base/trace_event/memory_dump_manager.h"   // no-presubmit-check
#include "base/trace_event/memory_dump_provider.h"  // no-presubmit-check
#endif  // BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)

#if !BUILDFLAG(SUPPORTS_CODE_ORDERING)
#error Only supported on architectures supporting code ordering (arm/arm64).
#endif  // !BUILDFLAG(SUPPORTS_CODE_ORDERING)

// Must be applied to all functions within this file.
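// Otherwise the compiler would emit instrumentation calls into the
// instrumentation itself, and the first instrumented call would recurse
// forever.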
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
#define INLINE_AND_NO_INSTRUMENT_FUNCTION \
  __attribute__((always_inline, no_instrument_function))

namespace base {
namespace android {
namespace orderfile {

namespace {

#if BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)
// This is defined in content/public/common/content_switches.h, which is not
// accessible in ::base.
constexpr const char kProcessTypeSwitch[] = "type";
#else
// Delay in seconds before the dump triggered by StartDelayedDump().
constexpr int kDelayInSeconds = 30;
#endif  // BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)

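// Assumed upper bound on the size of the native library's .text section
// (128MiB). One |callee_map| entry is kept per 4-byte slot of .text,
// matching the 4-byte granularity the processing code assumes below.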
constexpr size_t kMaxTextSizeInBytes = 1 << 27;
constexpr size_t kMaxElements = kMaxTextSizeInBytes / 4;
// Native code currently has ~850k symbols, hence recording up to 1M symbols
// can cover all possible callee symbols.
constexpr size_t kMaxReachedSymbols = 1 << 20;
// 3 callers are recorded per callee.
constexpr size_t kCallerBuckets = 3;
// The last bucket accounts for misses and for callers from outside the
// native code bounds.
constexpr size_t kMissesBucketIndex = 3;
constexpr size_t kTotalBuckets = 4;

std::atomic<uint32_t> callee_map[kMaxElements];
static_assert(sizeof(callee_map) == 128 * (1 << 20), "");
// Contains caller offsets: 4 buckets of callers per callee, where the last
// bucket is for misses.
std::atomic<uint32_t> g_caller_offset[kMaxReachedSymbols * kTotalBuckets];
static_assert(sizeof(g_caller_offset) == 16 * (1 << 20), "");
// Corresponding counts for |g_caller_offset|.
std::atomic<uint32_t> g_caller_count[kMaxReachedSymbols * kTotalBuckets];
static_assert(sizeof(g_caller_count) == 16 * (1 << 20), "");
// Index of the next free bucket group in |g_caller_offset| and
// |g_caller_count|.
std::atomic<uint32_t> g_callers_index;
std::atomic<uint32_t> g_calls_count;
std::atomic<bool> g_disabled;
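
// How the tables relate (illustrative values): a callee at .text offset
// 0x1000 uses callee_map[0x1000 / 4]; if that slot holds index 17, its
// caller buckets are g_caller_offset[17 * kTotalBuckets + 0..2] and
// g_caller_count[17 * kTotalBuckets + 0..3], the last count being the miss
// bucket.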

#if BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)
// Dumps offsets when a memory dump is requested. Used only if
// switches::kDevtoolsInstrumentationDumping is set.
class OrderfileMemoryDumpHook : public base::trace_event::MemoryDumpProvider {
  NO_INSTRUMENT_FUNCTION bool OnMemoryDump(
      const base::trace_event::MemoryDumpArgs& args,
      base::trace_event::ProcessMemoryDump* pmd) override {
    if (!Disable())
      return true;  // A dump has already been started.

    std::string process_type =
        base::CommandLine::ForCurrentProcess()->GetSwitchValueASCII(
            kProcessTypeSwitch);
    if (process_type.empty())
      process_type = "browser";

    Dump(process_type);
    return true;  // If something goes awry, a fatal error will be created
                  // internally.
  }
};
#endif  // BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)

// This is not racy: any number of threads calling this function concurrently,
// in any order, always end up with the same final count. Returns |element|'s
// value before the increment.
INLINE_AND_NO_INSTRUMENT_FUNCTION uint32_t
AtomicIncrement(std::atomic<uint32_t>* element) {
  return element->fetch_add(1, std::memory_order_relaxed);
}

// Increments the miss bucket for a callee. |index| is the first bucket of
// callers for this callee.
INLINE_AND_NO_INSTRUMENT_FUNCTION void RecordMiss(size_t index) {
  AtomicIncrement(g_caller_count + index + kMissesBucketIndex);
}

// Increments the caller count if this caller has previously been registered.
// If it hasn't, searches for an empty bucket and registers the caller there.
// Returns false if the caller is not registered and no empty bucket is left.
// |index| is the first bucket to register callers for a certain callee.
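// For example, with kCallerBuckets == 3, the first three distinct callers of
// a callee each claim one bucket; a fourth distinct caller finds no empty
// bucket, this returns false, and RecordAddress() counts it in the miss
// bucket instead.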
INLINE_AND_NO_INSTRUMENT_FUNCTION bool RecordCaller(size_t index,
                                                    size_t caller_offset) {
  for (size_t i = index; i < index + kCallerBuckets; i++) {
    auto offset = g_caller_offset[i].load(std::memory_order_relaxed);
    // This check is racy; a write could have happened between the load and
    // the check.
    if (offset == caller_offset) {
      // Caller already recorded, increment the count.
      AtomicIncrement(g_caller_count + i);
      return true;
    }
  }

  for (size_t i = index; i < index + kCallerBuckets; i++) {
    auto offset = g_caller_offset[i].load(std::memory_order_relaxed);
    uint32_t expected = 0;
    if (!offset) {
      // This is not racy, as the compare and exchange is done atomically; a
      // bucket can never be reset once it has been set. The call exchanges
      // the value in |g_caller_offset[i]| with |caller_offset| if
      // |g_caller_offset[i] == expected|. Otherwise, it returns false and
      // sets |expected = g_caller_offset[i]|.
      if (g_caller_offset[i].compare_exchange_strong(
              expected, caller_offset, std::memory_order_relaxed,
              std::memory_order_relaxed)) {
        AtomicIncrement(g_caller_count + i);
        return true;
      }
    }
    // This decreases the chances that we miss a caller because of unseen
    // changes made by another thread.
    if (offset == caller_offset || expected == caller_offset) {
      AtomicIncrement(g_caller_count + i);
      return true;
    }
  }
  return false;
}

template <bool for_testing>
__attribute__((always_inline, no_instrument_function)) void RecordAddress(
    size_t callee_address,
    size_t caller_address) {
  bool disabled = g_disabled.load(std::memory_order_relaxed);
  if (disabled)
    return;

  const size_t start =
      for_testing ? kStartOfTextForTesting : base::android::kStartOfText;
  const size_t end =
      for_testing ? kEndOfTextForTesting : base::android::kEndOfText;

  if (UNLIKELY(callee_address < start || callee_address > end)) {
    // Only the code in the native library is instrumented. Callees are
    // expected to be within the native library bounds.
    Disable();
    ImmediateCrash();
  }

  size_t offset = callee_address - start;
  static_assert(sizeof(int) == 4,
                "Collection and processing code assumes that sizeof(int) == 4");

  size_t offset_index = offset / 4;

  if (UNLIKELY(offset_index >= kMaxElements))
    return;

  std::atomic<uint32_t>* element = callee_map + offset_index;
  uint32_t callers_index = element->load(std::memory_order_relaxed);

  // Racy check.
  if (callers_index == 0) {
    // Fragmentation is possible, as |g_callers_index| is incremented based on
    // a racy check.
    uint32_t insertion_index = AtomicIncrement(&g_callers_index) + 1;
    if (UNLIKELY(insertion_index >= kMaxReachedSymbols))
      return;

    uint32_t expected = 0;
    // Exchanges the value in |element| with |insertion_index| if
    // |element == expected|. Otherwise, sets |expected = element|.
    element->compare_exchange_strong(expected, insertion_index,
                                     std::memory_order_relaxed,
                                     std::memory_order_relaxed);
    // If |expected| is non-zero, this callee has been seen before and already
    // has a corresponding index in the callers arrays.
    callers_index = expected == 0 ? insertion_index : expected;
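    // Note: if two threads raced above on a fresh callee, both incremented
    // |g_callers_index| but only one insertion won the exchange; the loser's
    // reserved buckets are simply never used (the fragmentation mentioned
    // above).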
  }

  AtomicIncrement(&g_calls_count);
  callers_index *= kTotalBuckets;
  if (caller_address <= start || caller_address > end ||
      !RecordCaller(callers_index, caller_address - start)) {
    // Record a miss if the caller is not within the bounds of the native
    // code, or if there is no empty bucket left to record one more caller
    // for this callee.
    RecordMiss(callers_index);
  }
}

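// Writes the collected call graph as JSON. The output has the following
// shape (illustrative values):
//   {"total_calls_count": "1000",
//    "call_graph": [{"index": "4", "callee_offset": "1024",
//                    "caller_and_count": [{"caller_offset": "512",
//                                          "count": "3"}]}]}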
NO_INSTRUMENT_FUNCTION bool DumpToFile(const base::FilePath& path) {
  auto file =
      base::File(path, base::File::FLAG_CREATE_ALWAYS | base::File::FLAG_WRITE);
  if (!file.IsValid()) {
    PLOG(ERROR) << "Could not open " << path;
    return false;
  }

  if (g_callers_index == 0) {
    LOG(ERROR) << "No entries to dump";
    return false;
  }

  // This can get very large, as it constructs the whole data structure in
  // memory before dumping it to the file.
  uint32_t total_calls_count = g_calls_count.load(std::memory_order_relaxed);
  auto root = base::Value::Dict().Set(
      "total_calls_count", base::StringPrintf("%" PRIu32, total_calls_count));
  auto call_graph = base::Value::List();
  for (size_t i = 0; i < kMaxElements; i++) {
    auto caller_index =
        callee_map[i].load(std::memory_order_relaxed) * kTotalBuckets;
    if (!caller_index) {
      // This callee was never called.
      continue;
    }

    uint32_t callee_offset = i * 4;
    auto callee_element =
        base::Value::Dict()
            .Set("index", base::StringPrintf("%" PRIuS, caller_index))
            .Set("callee_offset",
                 base::StringPrintf("%" PRIu32, callee_offset));
    std::string offset_str;
    auto callers_list = base::Value::List();
    for (size_t j = 0; j < kTotalBuckets; j++) {
      uint32_t caller_offset =
          g_caller_offset[caller_index + j].load(std::memory_order_relaxed);

      // The last bucket is for misses or callers outside the native library;
      // the caller_offset for this bucket is 0.
      if (j != kMissesBucketIndex && !caller_offset)
        continue;

      uint32_t count =
          g_caller_count[caller_index + j].load(std::memory_order_relaxed);
      // The count can only be 0 for the misses bucket. Otherwise, if
      // |caller_offset| is set then the count must be >= 1.
      CHECK_EQ(count || j == kMissesBucketIndex, true);
      if (!count) {
        // No misses.
        continue;
      }

      auto caller_count =
          base::Value::Dict()
              .Set("caller_offset",
                   base::StringPrintf("%" PRIu32, caller_offset))
              .Set("count", base::StringPrintf("%" PRIu32, count));
      callers_list.Append(std::move(caller_count));
    }
    callee_element.Set("caller_and_count", std::move(callers_list));
    call_graph.Append(std::move(callee_element));
  }

  root.Set("call_graph", std::move(call_graph));
  std::string output_js;
  if (!JSONWriter::Write(root, &output_js)) {
    LOG(FATAL) << "Error getting JSON string";
  }
  if (file.WriteAtCurrentPos(output_js.c_str(),
                             static_cast<int>(output_js.size())) < 0) {
    // If the file could be opened but writing failed, it is likely that data
    // was partially written. Producing incomplete profiling data would lead
    // to a poorly performing orderfile, but might not otherwise be noticed.
    // So we crash instead.
    LOG(FATAL) << "Error writing profile data";
  }
  return true;
}

// Stops recording, and outputs the data to a file under
// /data/local/tmp/chrome/orderfile/ named from |pid|, |start_ns_since_epoch|
// and |tag|.
NO_INSTRUMENT_FUNCTION void StopAndDumpToFile(int pid,
                                              uint64_t start_ns_since_epoch,
                                              const std::string& tag) {
  std::string tag_str;
  if (!tag.empty())
    tag_str = base::StringPrintf("%s-", tag.c_str());
  auto path = base::StringPrintf(
      "/data/local/tmp/chrome/orderfile/profile-hitmap-%s%d-%" PRIu64 ".txt",
      tag_str.c_str(), pid, start_ns_since_epoch);

  if (!DumpToFile(base::FilePath(path))) {
    LOG(ERROR) << "Problem with dump (" << tag << ")";
  }
}

}  // namespace

// It is safe to call any function after |Disable()| has been called. No risk
// of infinite recursion.
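// The first caller to flip |g_disabled| gets true and owns the dump; any
// later caller gets false.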
NO_INSTRUMENT_FUNCTION bool Disable() {
  bool disabled = g_disabled.exchange(true, std::memory_order_relaxed);
  std::atomic_thread_fence(std::memory_order_seq_cst);
  return !disabled;
}

NO_INSTRUMENT_FUNCTION void StartDelayedDump() {
#if BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)
  static auto* g_orderfile_memory_dump_hook = new OrderfileMemoryDumpHook();
  base::trace_event::MemoryDumpManager::GetInstance()->RegisterDumpProvider(
      g_orderfile_memory_dump_hook, "Orderfile", nullptr);
  // Return, letting devtools tracing handle any dumping.
#else
  // Uses std::thread, and avoids base::TimeTicks(), in order not to call too
  // many base:: symbols that would pollute the reached-symbol dumps.
  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC, &ts))
    PLOG(FATAL) << "clock_gettime.";
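  // Note: with CLOCK_MONOTONIC the value below is nanoseconds since boot, not
  // since the Unix epoch, despite the variable name; it is only used to make
  // the dump file name unique.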
  uint64_t start_ns_since_epoch =
      static_cast<uint64_t>(ts.tv_sec) * 1000 * 1000 * 1000 + ts.tv_nsec;
  int pid = getpid();
  std::thread([pid, start_ns_since_epoch]() {
    sleep(kDelayInSeconds);
    if (Disable())
      StopAndDumpToFile(pid, start_ns_since_epoch, "");
  }).detach();
#endif  // BUILDFLAG(DEVTOOLS_INSTRUMENTATION_DUMPING)
}

NO_INSTRUMENT_FUNCTION void Dump(const std::string& tag) {
  // As profiling has been disabled, none of the uses of ::base symbols below
  // will enter the symbol dump.
  StopAndDumpToFile(
      getpid(), (base::Time::Now() - base::Time::UnixEpoch()).InNanoseconds(),
      tag);
}

NO_INSTRUMENT_FUNCTION void ResetForTesting() {
  Disable();
  memset(reinterpret_cast<uint32_t*>(callee_map), 0,
         sizeof(uint32_t) * kMaxElements);
  memset(reinterpret_cast<uint32_t*>(g_caller_offset), 0,
         sizeof(uint32_t) * kMaxReachedSymbols * kTotalBuckets);
  memset(reinterpret_cast<uint32_t*>(g_caller_count), 0,
         sizeof(uint32_t) * kMaxReachedSymbols * kTotalBuckets);
  g_callers_index = 0;
  g_disabled = false;
}

NO_INSTRUMENT_FUNCTION void RecordAddressForTesting(size_t callee_address,
                                                    size_t caller_address) {
  return RecordAddress<true>(callee_address, caller_address);
}

// Returns a flattened vector where each callee is allocated 9 slots: the
// first slot holds the callee offset, followed by 8 slots holding
// [caller offset, count] pairs for the 4 buckets.
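// For instance, a callee at offset 8 with two recorded callers and one miss
// yields: [8, caller0, count0, caller1, count1, 0, 0, 0, miss_count].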
NO_INSTRUMENT_FUNCTION std::vector<size_t> GetOrderedOffsetsForTesting() {
  std::vector<size_t> result;
  for (size_t i = 0; i < kMaxElements; i++) {
    auto caller_index =
        callee_map[i].load(std::memory_order_relaxed) * kTotalBuckets;
    if (!caller_index)
      continue;

    result.push_back(i * 4);
    for (size_t j = 0; j < kTotalBuckets; j++) {
      uint32_t count =
          g_caller_count[caller_index + j].load(std::memory_order_relaxed);

      uint32_t caller_offset =
          g_caller_offset[caller_index + j].load(std::memory_order_relaxed);
      result.push_back(caller_offset);
      result.push_back(count);
    }
  }
  return result;
}

}  // namespace orderfile
}  // namespace android
}  // namespace base

extern "C" {

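// The compiler emits a call to this hook at every function entry (presumably
// via an instrument-functions style flag such as clang's
// -finstrument-function-entry-bare). From inside the hook, return address 0
// points into the instrumented callee and return address 1 into that
// callee's caller.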
NO_INSTRUMENT_FUNCTION void __cyg_profile_func_enter_bare() {
  base::android::orderfile::RecordAddress<false>(
      reinterpret_cast<size_t>(__builtin_return_address(0)),
      reinterpret_cast<size_t>(__builtin_return_address(1)));
}

}  // extern "C"