1 /*
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false // STOPSHIP if true.
19
20 #include "WatchdogPerfService.h"
21
22 #include <android-base/file.h>
23 #include <android-base/parseint.h>
24 #include <android-base/stringprintf.h>
25 #include <android-base/strings.h>
26 #include <log/log.h>
27 #include <processgroup/sched_policy.h>
28
29 #include <pthread.h>
30
31 #include <iterator>
32 #include <vector>
33
34 namespace android {
35 namespace automotive {
36 namespace watchdog {
37
38 namespace {
39
40 using ::android::sp;
41 using ::android::String16;
42 using ::android::String8;
43 using ::android::automotive::watchdog::internal::PowerCycle;
44 using ::android::automotive::watchdog::internal::UserState;
45 using ::android::base::Error;
46 using ::android::base::Join;
47 using ::android::base::ParseUint;
48 using ::android::base::Result;
49 using ::android::base::Split;
50 using ::android::base::StringAppendF;
51 using ::android::base::StringPrintf;
52 using ::android::base::WriteStringToFd;
53
54 // Minimum required collection interval between subsequent collections.
55 const std::chrono::nanoseconds kMinEventInterval = 1s;
56 const std::chrono::seconds kDefaultSystemEventCollectionInterval = 1s;
57 const std::chrono::seconds kDefaultPeriodicCollectionInterval = 20s;
58 const std::chrono::seconds kDefaultPeriodicMonitorInterval = 5s;
59 const std::chrono::nanoseconds kCustomCollectionInterval = 10s;
60 const std::chrono::nanoseconds kCustomCollectionDuration = 30min;
61
62 constexpr const char* kServiceName = "WatchdogPerfService";
63 static const std::string kDumpMajorDelimiter = std::string(100, '-') + "\n"; // NOLINT
64 constexpr const char* kHelpText =
65 "\n%s dump options:\n"
66 "%s: Starts custom performance data collection. Customize the collection behavior with "
67 "the following optional arguments:\n"
68 "\t%s <seconds>: Modifies the collection interval. Default behavior is to collect once "
69 "every %lld seconds.\n"
70 "\t%s <seconds>: Modifies the maximum collection duration. Default behavior is to collect "
71 "until %ld minutes before automatically stopping the custom collection and discarding "
72 "the collected data.\n"
73 "\t%s <package name>,<package name>,...: Comma-separated value containing package names. "
74 "When provided, the results are filtered only to the provided package names. Default "
75 "behavior is to list the results for the top N packages.\n"
76 "%s: Stops custom performance data collection and generates a dump of "
77 "the collection report.\n\n"
78 "When no options are specified, the car watchdog report contains the performance data "
79 "collected during boot-time and over the last few minutes before the report generation.\n";
80
parseSecondsFlag(const Vector<String16> & args,size_t pos)81 Result<std::chrono::seconds> parseSecondsFlag(const Vector<String16>& args, size_t pos) {
82 if (args.size() <= pos) {
83 return Error() << "Value not provided";
84 }
85 uint64_t value;
86 if (std::string strValue = std::string(String8(args[pos]).string());
87 !ParseUint(strValue, &value)) {
88 return Error() << "Invalid value " << strValue << ", must be an integer";
89 }
90 return std::chrono::seconds(value);
91 }
92
toString(std::variant<EventType,SwitchMessage> what)93 constexpr const char* toString(std::variant<EventType, SwitchMessage> what) {
94 return std::visit(
95 [&](const auto& v) -> const char* {
96 switch (static_cast<int>(v)) {
97 case EventType::INIT:
98 return "INIT";
99 case EventType::TERMINATED:
100 return "TERMINATED";
101 case EventType::BOOT_TIME_COLLECTION:
102 return "BOOT_TIME_COLLECTION";
103 case EventType::PERIODIC_COLLECTION:
104 return "PERIODIC_COLLECTION";
105 case EventType::USER_SWITCH_COLLECTION:
106 return "USER_SWITCH_COLLECTION";
107 case EventType::WAKE_UP_COLLECTION:
108 return "WAKE_UP_COLLECTION";
109 case EventType::CUSTOM_COLLECTION:
110 return "CUSTOM_COLLECTION";
111 case EventType::PERIODIC_MONITOR:
112 return "PERIODIC_MONITOR";
113 case EventType::LAST_EVENT:
114 return "LAST_EVENT";
115 case SwitchMessage::END_BOOTTIME_COLLECTION:
116 return "END_BOOTTIME_COLLECTION";
117 case SwitchMessage::END_USER_SWITCH_COLLECTION:
118 return "END_USER_SWITCH_COLLECTION";
119 case SwitchMessage::END_WAKE_UP_COLLECTION:
120 return "END_WAKE_UP_COLLECTION";
121 case SwitchMessage::END_CUSTOM_COLLECTION:
122 return "END_CUSTOM_COLLECTION";
123 default:
124 return "INVALID_EVENT_OR_SWITCH_MESSAGE";
125 }
126 },
127 what);
128 }
129
toString(SystemState systemState)130 constexpr const char* toString(SystemState systemState) {
131 switch (systemState) {
132 case SystemState::NORMAL_MODE:
133 return "NORMAL_MODE";
134 case SystemState::GARAGE_MODE:
135 return "GARAGE_MODE";
136 default:
137 return "UNKNOWN MODE";
138 }
139 }
140
141 } // namespace
142
toString() const143 std::string WatchdogPerfService::EventMetadata::toString() const {
144 std::string buffer;
145 const auto intervalInSecs = std::chrono::duration_cast<std::chrono::seconds>(interval).count();
146 StringAppendF(&buffer, "Event interval: %lld second%s\n", intervalInSecs,
147 ((intervalInSecs > 1) ? "s" : ""));
148 if (!filterPackages.empty()) {
149 std::vector<std::string> packages(filterPackages.begin(), filterPackages.end());
150 StringAppendF(&buffer, "Filtered results to packages: %s\n", Join(packages, ", ").c_str());
151 }
152 return buffer;
153 }
154
registerDataProcessor(sp<DataProcessorInterface> processor)155 Result<void> WatchdogPerfService::registerDataProcessor(sp<DataProcessorInterface> processor) {
156 if (processor == nullptr) {
157 return Error() << "Must provide a valid data processor";
158 }
159 if (const auto result = processor->init(); !result.ok()) {
160 return Error() << "Failed to initialize " << processor->name().c_str() << ": "
161 << result.error().message();
162 }
163 Mutex::Autolock lock(mMutex);
164 mDataProcessors.push_back(processor);
165 if (DEBUG) {
166 ALOGD("Successfully registered %s to %s", processor->name().c_str(), kServiceName);
167 }
168 return {};
169 }
170
start()171 Result<void> WatchdogPerfService::start() {
172 {
173 Mutex::Autolock lock(mMutex);
174 if (mCurrCollectionEvent != EventType::INIT || mCollectionThread.joinable()) {
175 return Error(INVALID_OPERATION) << "Cannot start " << kServiceName << " more than once";
176 }
177 std::chrono::nanoseconds systemEventCollectionInterval =
178 std::chrono::duration_cast<std::chrono::nanoseconds>(
179 std::chrono::seconds(sysprop::systemEventCollectionInterval().value_or(
180 kDefaultSystemEventCollectionInterval.count())));
181 std::chrono::nanoseconds periodicCollectionInterval =
182 std::chrono::duration_cast<std::chrono::nanoseconds>(
183 std::chrono::seconds(sysprop::periodicCollectionInterval().value_or(
184 kDefaultPeriodicCollectionInterval.count())));
185 std::chrono::nanoseconds periodicMonitorInterval =
186 std::chrono::duration_cast<std::chrono::nanoseconds>(
187 std::chrono::seconds(sysprop::periodicMonitorInterval().value_or(
188 kDefaultPeriodicMonitorInterval.count())));
189 mBoottimeCollection = {
190 .eventType = EventType::BOOT_TIME_COLLECTION,
191 .interval = systemEventCollectionInterval,
192 .lastUptime = 0,
193 };
194 mPeriodicCollection = {
195 .eventType = EventType::PERIODIC_COLLECTION,
196 .interval = periodicCollectionInterval,
197 .lastUptime = 0,
198 };
199 mUserSwitchCollection = {{
200 .eventType = EventType::USER_SWITCH_COLLECTION,
201 .interval = systemEventCollectionInterval,
202 .lastUptime = 0,
203 }};
204 mWakeUpCollection = {
205 .eventType = EventType::WAKE_UP_COLLECTION,
206 .interval = systemEventCollectionInterval,
207 .lastUptime = 0,
208 };
209 mPeriodicMonitor = {
210 .eventType = EventType::PERIODIC_MONITOR,
211 .interval = periodicMonitorInterval,
212 .lastUptime = 0,
213 };
214 if (mDataProcessors.empty()) {
215 ALOGE("Terminating %s: No data processor is registered", kServiceName);
216 mCurrCollectionEvent = EventType::TERMINATED;
217 return Error() << "No data processor is registered";
218 }
219 mUidStatsCollector->init();
220 mProcStatCollector->init();
221 mProcDiskStatsCollector->init();
222 }
223
224 mCollectionThread = std::thread([&]() {
225 {
226 Mutex::Autolock lock(mMutex);
227 if (EventType expected = EventType::INIT; mCurrCollectionEvent != expected) {
228 ALOGE("Skipping performance data collection as the current collection event "
229 "%s != %s",
230 toString(mCurrCollectionEvent), toString(expected));
231 return;
232 }
233 notifySystemStartUpLocked();
234 mCurrCollectionEvent = EventType::BOOT_TIME_COLLECTION;
235 mBoottimeCollection.lastUptime = mHandlerLooper->now();
236 mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
237 mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
238 EventType::BOOT_TIME_COLLECTION);
239 }
240 if (set_sched_policy(0, SP_BACKGROUND) != 0) {
241 ALOGW("Failed to set background scheduling priority to %s thread", kServiceName);
242 }
243 if (int result = pthread_setname_np(pthread_self(), "WatchdogPerfSvc"); result != 0) {
244 ALOGE("Failed to set %s thread name: %d", kServiceName, result);
245 }
246 ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
247 bool isCollectionActive = true;
248 /*
249 * Loop until the collection is not active -- performance collection runs on this thread in
250 * a handler.
251 */
252 while (isCollectionActive) {
253 mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
254 Mutex::Autolock lock(mMutex);
255 isCollectionActive = mCurrCollectionEvent != EventType::TERMINATED;
256 }
257 });
258 return {};
259 }
260
terminate()261 void WatchdogPerfService::terminate() {
262 {
263 Mutex::Autolock lock(mMutex);
264 if (mCurrCollectionEvent == EventType::TERMINATED) {
265 ALOGE("%s was terminated already", kServiceName);
266 return;
267 }
268 ALOGE("Terminating %s as car watchdog is terminating", kServiceName);
269 if (mCurrCollectionEvent != EventType::INIT) {
270 /*
271 * Looper runs only after EventType::INIT has completed so remove looper messages
272 * and wake the looper only when the current collection has changed from INIT.
273 */
274 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
275 mHandlerLooper->wake();
276 }
277 for (const auto& processor : mDataProcessors) {
278 processor->terminate();
279 }
280 mCurrCollectionEvent = EventType::TERMINATED;
281 }
282 if (mCollectionThread.joinable()) {
283 mCollectionThread.join();
284 if (DEBUG) {
285 ALOGD("%s collection thread terminated", kServiceName);
286 }
287 }
288 }
289
setSystemState(SystemState systemState)290 void WatchdogPerfService::setSystemState(SystemState systemState) {
291 Mutex::Autolock lock(mMutex);
292 if (mSystemState != systemState) {
293 ALOGI("%s switching from %s to %s", kServiceName, toString(mSystemState),
294 toString(systemState));
295 }
296 mSystemState = systemState;
297 }
298
onBootFinished()299 Result<void> WatchdogPerfService::onBootFinished() {
300 Mutex::Autolock lock(mMutex);
301 if (EventType expected = EventType::BOOT_TIME_COLLECTION; mCurrCollectionEvent != expected) {
302 /*
303 * This case happens when either the WatchdogPerfService has prematurely terminated before
304 * boot complete notification is received or multiple boot complete notifications are
305 * received. In either case don't return error as this will lead to runtime exception and
306 * cause system to boot loop.
307 */
308 ALOGE("Current performance data collection event %s != %s", toString(mCurrCollectionEvent),
309 toString(expected));
310 return {};
311 }
312 nsecs_t endBootCollectionTime = mHandlerLooper->now() + mPostSystemEventDurationNs.count();
313 mHandlerLooper->sendMessageAtTime(endBootCollectionTime,
314 sp<WatchdogPerfService>::fromExisting(this),
315 SwitchMessage::END_BOOTTIME_COLLECTION);
316 if (DEBUG) {
317 ALOGD("Boot complete signal received.");
318 }
319 return {};
320 }
321
onUserStateChange(userid_t userId,const UserState & userState)322 Result<void> WatchdogPerfService::onUserStateChange(userid_t userId, const UserState& userState) {
323 Mutex::Autolock lock(mMutex);
324 if (mCurrCollectionEvent == EventType::BOOT_TIME_COLLECTION ||
325 mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
326 // Ignoring the user switch events because the boot-time and custom collections take
327 // precedence over other collections.
328 return {};
329 }
330 switch (static_cast<int>(userState)) {
331 case static_cast<int>(UserState::USER_STATE_SWITCHING):
332 // TODO(b/243984863): Handle multi-user switching scenario.
333 mUserSwitchCollection.from = mUserSwitchCollection.to;
334 mUserSwitchCollection.to = userId;
335 if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION &&
336 mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
337 ALOGE("Unable to start %s. Current performance data collection event: %s",
338 toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
339 return {};
340 }
341 startUserSwitchCollection();
342 ALOGI("Switching to %s (userIds: from = %d, to = %d)", toString(mCurrCollectionEvent),
343 mUserSwitchCollection.from, mUserSwitchCollection.to);
344 break;
345 case static_cast<int>(UserState::USER_STATE_UNLOCKING):
346 if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION) {
347 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
348 ALOGE("Unable to start %s. Current performance data collection event: %s",
349 toString(EventType::USER_SWITCH_COLLECTION),
350 toString(mCurrCollectionEvent));
351 }
352 return {};
353 }
354 if (mUserSwitchCollection.to != userId) {
355 return {};
356 }
357 startUserSwitchCollection();
358 ALOGI("Switching to %s (userId: %d)", toString(mCurrCollectionEvent), userId);
359 break;
360 case static_cast<int>(UserState::USER_STATE_POST_UNLOCKED): {
361 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
362 ALOGE("Ignoring USER_STATE_POST_UNLOCKED because no user switch collection in "
363 "progress. Current performance data collection event: %s.",
364 toString(mCurrCollectionEvent));
365 return {};
366 }
367 if (mUserSwitchCollection.to != userId) {
368 ALOGE("Ignoring USER_STATE_POST_UNLOCKED signal for user id: %d. "
369 "Current user being switched to: %d",
370 userId, mUserSwitchCollection.to);
371 return {};
372 }
373 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
374 mHandlerLooper->removeMessages(thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
375 nsecs_t endUserSwitchCollectionTime =
376 mHandlerLooper->now() + mPostSystemEventDurationNs.count();
377 mHandlerLooper->sendMessageAtTime(endUserSwitchCollectionTime, thiz,
378 SwitchMessage::END_USER_SWITCH_COLLECTION);
379 break;
380 }
381 default:
382 ALOGE("Unsupported user state: %d", static_cast<int>(userState));
383 return {};
384 }
385 if (DEBUG) {
386 ALOGD("Handled user state change: userId = %d, userState = %d", userId,
387 static_cast<int>(userState));
388 }
389 return {};
390 }
391
startUserSwitchCollection()392 Result<void> WatchdogPerfService::startUserSwitchCollection() {
393 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
394 mHandlerLooper->removeMessages(thiz);
395 mUserSwitchCollection.lastUptime = mHandlerLooper->now();
396 // End |EventType::USER_SWITCH_COLLECTION| after a timeout because the user switch end
397 // signal won't be received within a few seconds when the switch is blocked due to a
398 // keyguard event. Otherwise, polling beyond a few seconds will lead to unnecessary data
399 // collection.
400 nsecs_t uptime = mHandlerLooper->now() + mUserSwitchTimeoutNs.count();
401 mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
402 mCurrCollectionEvent = EventType::USER_SWITCH_COLLECTION;
403 mHandlerLooper->sendMessage(thiz, EventType::USER_SWITCH_COLLECTION);
404 return {};
405 }
406
onSuspendExit()407 Result<void> WatchdogPerfService::onSuspendExit() {
408 Mutex::Autolock lock(mMutex);
409 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
410 // Ignoring the suspend exit event because the custom collection takes
411 // precedence over other collections.
412 ALOGE("Unable to start %s. Current performance data collection event: %s",
413 toString(EventType::WAKE_UP_COLLECTION), toString(mCurrCollectionEvent));
414 return {};
415 }
416 if (mCurrCollectionEvent == EventType::WAKE_UP_COLLECTION) {
417 ALOGE("The current performance data collection event is already %s",
418 toString(EventType::WAKE_UP_COLLECTION));
419 return {};
420 }
421 notifySystemStartUpLocked();
422 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
423 mHandlerLooper->removeMessages(thiz);
424 mWakeUpCollection.lastUptime = mHandlerLooper->now();
425 nsecs_t uptime = mHandlerLooper->now() + mWakeUpDurationNs.count();
426 mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_WAKE_UP_COLLECTION);
427 mCurrCollectionEvent = EventType::WAKE_UP_COLLECTION;
428 mHandlerLooper->sendMessage(thiz, EventType::WAKE_UP_COLLECTION);
429 ALOGI("Switching to %s", toString(mCurrCollectionEvent));
430 return {};
431 }
432
onShutdownEnter()433 Result<void> WatchdogPerfService::onShutdownEnter() {
434 Mutex::Autolock lock(mMutex);
435 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
436 ALOGI("Unable to switch to %s during shutdown enter. Current performance data collection "
437 "event: %s",
438 toString(EventType::PERIODIC_COLLECTION), toString(mCurrCollectionEvent));
439 return {};
440 }
441 switchToPeriodicLocked(/*startNow=*/true);
442 return {};
443 }
444
onCustomCollection(int fd,const Vector<String16> & args)445 Result<void> WatchdogPerfService::onCustomCollection(int fd, const Vector<String16>& args) {
446 if (args.empty()) {
447 return Error(BAD_VALUE) << "No custom collection dump arguments";
448 }
449
450 if (args[0] == String16(kStartCustomCollectionFlag)) {
451 if (args.size() > 7) {
452 return Error(BAD_VALUE) << "Number of arguments to start custom performance data "
453 << "collection cannot exceed 7";
454 }
455 std::chrono::nanoseconds interval = kCustomCollectionInterval;
456 std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
457 std::unordered_set<std::string> filterPackages;
458 for (size_t i = 1; i < args.size(); ++i) {
459 if (args[i] == String16(kIntervalFlag)) {
460 const auto& result = parseSecondsFlag(args, i + 1);
461 if (!result.ok()) {
462 return Error(BAD_VALUE)
463 << "Failed to parse " << kIntervalFlag << ": " << result.error();
464 }
465 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
466 ++i;
467 continue;
468 }
469 if (args[i] == String16(kMaxDurationFlag)) {
470 const auto& result = parseSecondsFlag(args, i + 1);
471 if (!result.ok()) {
472 return Error(BAD_VALUE)
473 << "Failed to parse " << kMaxDurationFlag << ": " << result.error();
474 }
475 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
476 ++i;
477 continue;
478 }
479 if (args[i] == String16(kFilterPackagesFlag)) {
480 if (args.size() < i + 1) {
481 return Error(BAD_VALUE)
482 << "Must provide value for '" << kFilterPackagesFlag << "' flag";
483 }
484 std::vector<std::string> packages =
485 Split(std::string(String8(args[i + 1]).string()), ",");
486 std::copy(packages.begin(), packages.end(),
487 std::inserter(filterPackages, filterPackages.end()));
488 ++i;
489 continue;
490 }
491 ALOGW("Unknown flag %s provided to start custom performance data collection",
492 String8(args[i]).string());
493 return Error(BAD_VALUE) << "Unknown flag " << String8(args[i]).string()
494 << " provided to start custom performance data collection";
495 }
496 if (const auto& result = startCustomCollection(interval, maxDuration, filterPackages);
497 !result.ok()) {
498 WriteStringToFd(result.error().message(), fd);
499 return result;
500 }
501 return {};
502 }
503
504 if (args[0] == String16(kEndCustomCollectionFlag)) {
505 if (args.size() != 1) {
506 ALOGW("Number of arguments to stop custom performance data collection cannot exceed 1. "
507 "Stopping the data collection.");
508 WriteStringToFd("Number of arguments to stop custom performance data collection "
509 "cannot exceed 1. Stopping the data collection.",
510 fd);
511 }
512 return endCustomCollection(fd);
513 }
514
515 return Error(BAD_VALUE) << "Custom perf collection dump arguments start neither with "
516 << kStartCustomCollectionFlag << " nor with "
517 << kEndCustomCollectionFlag << " flags";
518 }
519
onDump(int fd) const520 Result<void> WatchdogPerfService::onDump(int fd) const {
521 Mutex::Autolock lock(mMutex);
522 if (mCurrCollectionEvent == EventType::TERMINATED) {
523 ALOGW("%s not active. Dumping cached data", kServiceName);
524 if (!WriteStringToFd(StringPrintf("%s not active. Dumping cached data.", kServiceName),
525 fd)) {
526 return Error(FAILED_TRANSACTION) << "Failed to write " << kServiceName << " status";
527 }
528 }
529
530 if (const auto& result = dumpCollectorsStatusLocked(fd); !result.ok()) {
531 return Error(FAILED_TRANSACTION) << result.error();
532 }
533
534 if (!WriteStringToFd(StringPrintf("\n%s%s report:\n%sBoot-time collection information:\n%s\n",
535 kDumpMajorDelimiter.c_str(), kServiceName,
536 kDumpMajorDelimiter.c_str(), std::string(33, '=').c_str()),
537 fd) ||
538 !WriteStringToFd(mBoottimeCollection.toString(), fd) ||
539 !WriteStringToFd(StringPrintf("\nWake-up collection information:\n%s\n",
540 std::string(31, '=').c_str()),
541 fd) ||
542 !WriteStringToFd(mWakeUpCollection.toString(), fd) ||
543 !WriteStringToFd(StringPrintf("\nUser-switch collection information:\n%s\n",
544 std::string(35, '=').c_str()),
545 fd) ||
546 !WriteStringToFd(mUserSwitchCollection.toString(), fd) ||
547 !WriteStringToFd(StringPrintf("\nPeriodic collection information:\n%s\n",
548 std::string(32, '=').c_str()),
549 fd) ||
550 !WriteStringToFd(mPeriodicCollection.toString(), fd)) {
551 return Error(FAILED_TRANSACTION)
552 << "Failed to dump the boot-time and periodic collection reports.";
553 }
554
555 for (const auto& processor : mDataProcessors) {
556 if (const auto result = processor->onDump(fd); !result.ok()) {
557 return result;
558 }
559 }
560
561 WriteStringToFd(kDumpMajorDelimiter, fd);
562 return {};
563 }
564
dumpHelpText(int fd) const565 bool WatchdogPerfService::dumpHelpText(int fd) const {
566 return WriteStringToFd(StringPrintf(kHelpText, kServiceName, kStartCustomCollectionFlag,
567 kIntervalFlag,
568 std::chrono::duration_cast<std::chrono::seconds>(
569 kCustomCollectionInterval)
570 .count(),
571 kMaxDurationFlag,
572 std::chrono::duration_cast<std::chrono::minutes>(
573 kCustomCollectionDuration)
574 .count(),
575 kFilterPackagesFlag, kEndCustomCollectionFlag),
576 fd);
577 }
578
dumpCollectorsStatusLocked(int fd) const579 Result<void> WatchdogPerfService::dumpCollectorsStatusLocked(int fd) const {
580 if (!mUidStatsCollector->enabled() &&
581 !WriteStringToFd(StringPrintf("UidStatsCollector failed to access proc and I/O files"),
582 fd)) {
583 return Error() << "Failed to write UidStatsCollector status";
584 }
585 if (!mProcStatCollector->enabled() &&
586 !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
587 mProcStatCollector->filePath().c_str()),
588 fd)) {
589 return Error() << "Failed to write ProcStat collector status";
590 }
591 return {};
592 }
593
startCustomCollection(std::chrono::nanoseconds interval,std::chrono::nanoseconds maxDuration,const std::unordered_set<std::string> & filterPackages)594 Result<void> WatchdogPerfService::startCustomCollection(
595 std::chrono::nanoseconds interval, std::chrono::nanoseconds maxDuration,
596 const std::unordered_set<std::string>& filterPackages) {
597 if (interval < kMinEventInterval || maxDuration < kMinEventInterval) {
598 return Error(INVALID_OPERATION)
599 << "Collection interval and maximum duration must be >= "
600 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinEventInterval).count()
601 << " milliseconds.";
602 }
603 Mutex::Autolock lock(mMutex);
604 if (EventType expected = EventType::PERIODIC_COLLECTION; mCurrCollectionEvent != expected) {
605 return Error(INVALID_OPERATION)
606 << "Cannot start a custom collection when the current collection event "
607 << toString(mCurrCollectionEvent) << " != " << toString(expected)
608 << " collection event";
609 }
610
611 mCustomCollection = {
612 .eventType = EventType::CUSTOM_COLLECTION,
613 .interval = interval,
614 .lastUptime = mHandlerLooper->now(),
615 .filterPackages = filterPackages,
616 };
617
618 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
619 mHandlerLooper->removeMessages(thiz);
620 nsecs_t uptime = mHandlerLooper->now() + maxDuration.count();
621 mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_CUSTOM_COLLECTION);
622 mCurrCollectionEvent = EventType::CUSTOM_COLLECTION;
623 mHandlerLooper->sendMessage(thiz, EventType::CUSTOM_COLLECTION);
624 ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
625 return {};
626 }
627
endCustomCollection(int fd)628 Result<void> WatchdogPerfService::endCustomCollection(int fd) {
629 Mutex::Autolock lock(mMutex);
630 if (mCurrCollectionEvent != EventType::CUSTOM_COLLECTION) {
631 return Error(INVALID_OPERATION) << "No custom collection is running";
632 }
633
634 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
635 mHandlerLooper->removeMessages(thiz);
636 mHandlerLooper->sendMessage(thiz, SwitchMessage::END_CUSTOM_COLLECTION);
637
638 if (const auto result = dumpCollectorsStatusLocked(fd); !result.ok()) {
639 return Error(FAILED_TRANSACTION) << result.error();
640 }
641
642 if (!WriteStringToFd(StringPrintf("%sPerformance data report for custom collection:\n%s",
643 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
644 fd) ||
645 !WriteStringToFd(mCustomCollection.toString(), fd)) {
646 return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
647 }
648
649 for (const auto& processor : mDataProcessors) {
650 if (const auto result = processor->onCustomCollectionDump(fd); !result.ok()) {
651 return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
652 << " collection: " << result.error();
653 }
654 }
655
656 if (DEBUG) {
657 ALOGD("Custom event finished");
658 }
659 WriteStringToFd(kDumpMajorDelimiter, fd);
660 return {};
661 }
662
switchToPeriodicLocked(bool startNow)663 void WatchdogPerfService::switchToPeriodicLocked(bool startNow) {
664 if (mCurrCollectionEvent == EventType::PERIODIC_COLLECTION) {
665 ALOGW("The current performance data collection event is already %s",
666 toString(mCurrCollectionEvent));
667 return;
668 }
669 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
670 mHandlerLooper->removeMessages(thiz);
671 mCurrCollectionEvent = EventType::PERIODIC_COLLECTION;
672 mPeriodicCollection.lastUptime = mHandlerLooper->now();
673 if (startNow) {
674 mHandlerLooper->sendMessage(thiz, EventType::PERIODIC_COLLECTION);
675 } else {
676 mPeriodicCollection.lastUptime += mPeriodicCollection.interval.count();
677 mHandlerLooper->sendMessageAtTime(mPeriodicCollection.lastUptime, thiz,
678 EventType::PERIODIC_COLLECTION);
679 }
680 mPeriodicMonitor.lastUptime = mHandlerLooper->now() + mPeriodicMonitor.interval.count();
681 mHandlerLooper->sendMessageAtTime(mPeriodicMonitor.lastUptime, thiz,
682 EventType::PERIODIC_MONITOR);
683 ALOGI("Switching to %s and %s", toString(mCurrCollectionEvent),
684 toString(EventType::PERIODIC_MONITOR));
685 }
686
handleMessage(const Message & message)687 void WatchdogPerfService::handleMessage(const Message& message) {
688 Result<void> result;
689
690 switch (message.what) {
691 case static_cast<int>(EventType::BOOT_TIME_COLLECTION):
692 result = processCollectionEvent(&mBoottimeCollection);
693 break;
694 case static_cast<int>(SwitchMessage::END_BOOTTIME_COLLECTION):
695 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
696 if (result = processCollectionEvent(&mBoottimeCollection); result.ok()) {
697 Mutex::Autolock lock(mMutex);
698 switchToPeriodicLocked(/*startNow=*/false);
699 }
700 break;
701 case static_cast<int>(EventType::PERIODIC_COLLECTION):
702 result = processCollectionEvent(&mPeriodicCollection);
703 break;
704 case static_cast<int>(EventType::USER_SWITCH_COLLECTION):
705 result = processCollectionEvent(&mUserSwitchCollection);
706 break;
707 case static_cast<int>(EventType::WAKE_UP_COLLECTION):
708 result = processCollectionEvent(&mWakeUpCollection);
709 break;
710 case static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION):
711 case static_cast<int>(SwitchMessage::END_WAKE_UP_COLLECTION): {
712 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
713 EventMetadata* eventMetadata =
714 message.what == static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION)
715 ? &mUserSwitchCollection
716 : &mWakeUpCollection;
717 if (result = processCollectionEvent(eventMetadata); result.ok()) {
718 Mutex::Autolock lock(mMutex);
719 switchToPeriodicLocked(/*startNow=*/false);
720 }
721 break;
722 }
723 case static_cast<int>(EventType::CUSTOM_COLLECTION):
724 result = processCollectionEvent(&mCustomCollection);
725 break;
726 case static_cast<int>(EventType::PERIODIC_MONITOR):
727 result = processMonitorEvent(&mPeriodicMonitor);
728 break;
729 case static_cast<int>(SwitchMessage::END_CUSTOM_COLLECTION): {
730 Mutex::Autolock lock(mMutex);
731 if (EventType expected = EventType::CUSTOM_COLLECTION;
732 mCurrCollectionEvent != expected) {
733 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
734 toString(mCurrCollectionEvent), toString(expected));
735 return;
736 }
737 mCustomCollection = {};
738 for (const auto& processor : mDataProcessors) {
739 /*
740 * Clear custom collection cache on the data processors when the custom collection
741 * ends.
742 */
743 processor->onCustomCollectionDump(-1);
744 }
745 switchToPeriodicLocked(/*startNow=*/true);
746 return;
747 }
748 default:
749 result = Error() << "Unknown message: " << message.what;
750 }
751
752 if (!result.ok()) {
753 Mutex::Autolock lock(mMutex);
754 ALOGE("Terminating %s: %s", kServiceName, result.error().message().c_str());
755 /*
756 * DO NOT CALL terminate() as it tries to join the collection thread but this code is
757 * executed on the collection thread. Thus it will result in a deadlock.
758 */
759 mCurrCollectionEvent = EventType::TERMINATED;
760 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
761 mHandlerLooper->wake();
762 }
763 }
764
processCollectionEvent(WatchdogPerfService::EventMetadata * metadata)765 Result<void> WatchdogPerfService::processCollectionEvent(
766 WatchdogPerfService::EventMetadata* metadata) {
767 Mutex::Autolock lock(mMutex);
768 /*
769 * Messages sent to the looper are intrinsically racy such that a message from the previous
770 * collection event may land in the looper after the current collection has already begun. Thus
771 * verify the current collection event before starting the collection.
772 */
773 if (mCurrCollectionEvent != metadata->eventType) {
774 ALOGW("Skipping %s event on collection event %s", toString(metadata->eventType),
775 toString(mCurrCollectionEvent));
776 return {};
777 }
778 if (DEBUG) {
779 ALOGD("Processing %s collection event", toString(metadata->eventType));
780 }
781 if (metadata->interval < kMinEventInterval) {
782 return Error()
783 << "Collection interval of "
784 << std::chrono::duration_cast<std::chrono::seconds>(metadata->interval).count()
785 << " seconds for " << toString(metadata->eventType)
786 << " collection cannot be less than "
787 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
788 << " seconds";
789 }
790 if (const auto result = collectLocked(metadata); !result.ok()) {
791 return Error() << toString(metadata->eventType) << " collection failed: " << result.error();
792 }
793 metadata->lastUptime += metadata->interval.count();
794 mHandlerLooper->sendMessageAtTime(metadata->lastUptime,
795 sp<WatchdogPerfService>::fromExisting(this),
796 metadata->eventType);
797 return {};
798 }
799
collectLocked(WatchdogPerfService::EventMetadata * metadata)800 Result<void> WatchdogPerfService::collectLocked(WatchdogPerfService::EventMetadata* metadata) {
801 if (!mUidStatsCollector->enabled() && !mProcStatCollector->enabled()) {
802 return Error() << "No collectors enabled";
803 }
804
805 time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
806
807 if (mUidStatsCollector->enabled()) {
808 if (const auto result = mUidStatsCollector->collect(); !result.ok()) {
809 return Error() << "Failed to collect per-uid proc and I/O stats: " << result.error();
810 }
811 }
812
813 if (mProcStatCollector->enabled()) {
814 if (const auto result = mProcStatCollector->collect(); !result.ok()) {
815 return Error() << "Failed to collect proc stats: " << result.error();
816 }
817 }
818
819 for (const auto& processor : mDataProcessors) {
820 Result<void> result;
821 switch (mCurrCollectionEvent) {
822 case EventType::BOOT_TIME_COLLECTION:
823 result = processor->onBoottimeCollection(now, mUidStatsCollector,
824 mProcStatCollector);
825 break;
826 case EventType::PERIODIC_COLLECTION:
827 result = processor->onPeriodicCollection(now, mSystemState, mUidStatsCollector,
828 mProcStatCollector);
829 break;
830 case EventType::USER_SWITCH_COLLECTION: {
831 WatchdogPerfService::UserSwitchEventMetadata* userSwitchMetadata =
832 static_cast<WatchdogPerfService::UserSwitchEventMetadata*>(metadata);
833 result = processor->onUserSwitchCollection(now, userSwitchMetadata->from,
834 userSwitchMetadata->to,
835 mUidStatsCollector, mProcStatCollector);
836 break;
837 }
838 case EventType::WAKE_UP_COLLECTION:
839 result = processor->onWakeUpCollection(now, mUidStatsCollector, mProcStatCollector);
840 break;
841 case EventType::CUSTOM_COLLECTION:
842 result = processor->onCustomCollection(now, mSystemState, metadata->filterPackages,
843 mUidStatsCollector, mProcStatCollector);
844 break;
845 default:
846 result = Error() << "Invalid collection event " << toString(mCurrCollectionEvent);
847 }
848 if (!result.ok()) {
849 return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
850 << " collection: " << result.error();
851 }
852 }
853
854 return {};
855 }
856
processMonitorEvent(WatchdogPerfService::EventMetadata * metadata)857 Result<void> WatchdogPerfService::processMonitorEvent(
858 WatchdogPerfService::EventMetadata* metadata) {
859 if (metadata->eventType != static_cast<int>(EventType::PERIODIC_MONITOR)) {
860 return Error() << "Invalid monitor event " << toString(metadata->eventType);
861 }
862 if (DEBUG) {
863 ALOGD("Processing %s monitor event", toString(metadata->eventType));
864 }
865 if (metadata->interval < kMinEventInterval) {
866 return Error()
867 << "Monitor interval of "
868 << std::chrono::duration_cast<std::chrono::seconds>(metadata->interval).count()
869 << " seconds for " << toString(metadata->eventType) << " event cannot be less than "
870 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
871 << " seconds";
872 }
873 Mutex::Autolock lock(mMutex);
874 if (!mProcDiskStatsCollector->enabled()) {
875 return Error() << "Cannot access proc disk stats for monitoring";
876 }
877 time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
878 if (const auto result = mProcDiskStatsCollector->collect(); !result.ok()) {
879 return Error() << "Failed to collect disk stats: " << result.error();
880 }
881 auto* currCollectionMetadata = currCollectionMetadataLocked();
882 if (currCollectionMetadata == nullptr) {
883 return Error() << "No metadata available for current collection event: "
884 << toString(mCurrCollectionEvent);
885 }
886 bool requestedCollection = false;
887 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
888 const auto requestCollection = [&]() mutable {
889 if (requestedCollection) {
890 return;
891 }
892 const nsecs_t prevUptime =
893 currCollectionMetadata->lastUptime - currCollectionMetadata->interval.count();
894 nsecs_t uptime = mHandlerLooper->now();
895 if (const auto delta = std::abs(uptime - prevUptime); delta < kMinEventInterval.count()) {
896 return;
897 }
898 currCollectionMetadata->lastUptime = uptime;
899 mHandlerLooper->removeMessages(thiz, currCollectionMetadata->eventType);
900 mHandlerLooper->sendMessage(thiz, currCollectionMetadata->eventType);
901 requestedCollection = true;
902 };
903 for (const auto& processor : mDataProcessors) {
904 if (const auto result =
905 processor->onPeriodicMonitor(now, mProcDiskStatsCollector, requestCollection);
906 !result.ok()) {
907 return Error() << processor->name() << " failed on " << toString(metadata->eventType)
908 << ": " << result.error();
909 }
910 }
911 metadata->lastUptime += metadata->interval.count();
912 if (metadata->lastUptime == currCollectionMetadata->lastUptime) {
913 /*
914 * If the |PERIODIC_MONITOR| and *_COLLECTION events overlap, skip the |PERIODIC_MONITOR|
915 * event.
916 */
917 metadata->lastUptime += metadata->interval.count();
918 }
919 mHandlerLooper->sendMessageAtTime(metadata->lastUptime, thiz, metadata->eventType);
920 return {};
921 }
922
notifySystemStartUpLocked()923 Result<void> WatchdogPerfService::notifySystemStartUpLocked() {
924 for (const auto& processor : mDataProcessors) {
925 if (const auto result = processor->onSystemStartup(); !result.ok()) {
926 ALOGE("%s failed to process system startup event", processor->name().c_str());
927 return Error() << processor->name() << " failed to process system startup event";
928 }
929 }
930 return {};
931 }
932
currCollectionMetadataLocked()933 WatchdogPerfService::EventMetadata* WatchdogPerfService::currCollectionMetadataLocked() {
934 switch (mCurrCollectionEvent) {
935 case EventType::BOOT_TIME_COLLECTION:
936 return &mBoottimeCollection;
937 case EventType::PERIODIC_COLLECTION:
938 return &mPeriodicCollection;
939 case EventType::CUSTOM_COLLECTION:
940 return &mCustomCollection;
941 default:
942 return nullptr;
943 }
944 }
945
946 } // namespace watchdog
947 } // namespace automotive
948 } // namespace android
949