1 /*
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false // STOPSHIP if true.
19
20 #include "WatchdogPerfService.h"
21
22 #include <android-base/file.h>
23 #include <android-base/parseint.h>
24 #include <android-base/stringprintf.h>
25 #include <android-base/strings.h>
26 #include <log/log.h>
27 #include <processgroup/sched_policy.h>
28
29 #include <pthread.h>
30
31 #include <iterator>
32 #include <vector>
33
34 namespace android {
35 namespace automotive {
36 namespace watchdog {
37
38 namespace {
39
40 using ::aidl::android::automotive::watchdog::internal::ResourceStats;
41 using ::aidl::android::automotive::watchdog::internal::UserState;
42 using ::android::sp;
43 using ::android::String16;
44 using ::android::String8;
45 using ::android::base::EqualsIgnoreCase;
46 using ::android::base::Error;
47 using ::android::base::Join;
48 using ::android::base::ParseUint;
49 using ::android::base::Result;
50 using ::android::base::Split;
51 using ::android::base::StringAppendF;
52 using ::android::base::StringPrintf;
53 using ::android::base::WriteStringToFd;
54
55 const int32_t kMaxCachedUnsentResourceStats = 10;
56 const std::chrono::nanoseconds kPrevUnsentResourceStatsDelayNs = 3s;
57 // Minimum required collection polling interval between subsequent collections.
58 const std::chrono::nanoseconds kMinEventInterval = 1s;
59 const std::chrono::seconds kDefaultSystemEventCollectionInterval = 1s;
60 const std::chrono::seconds kDefaultPeriodicCollectionInterval = 20s;
61 const std::chrono::seconds kDefaultPeriodicMonitorInterval = 5s;
62 const std::chrono::nanoseconds kCustomCollectionInterval = 10s;
63 const std::chrono::nanoseconds kCustomCollectionDuration = 30min;
64
65 constexpr const char* kServiceName = "WatchdogPerfService";
66 static const std::string kDumpMajorDelimiter = std::string(100, '-') + "\n"; // NOLINT
67 constexpr const char* kHelpText =
68 "\n%s dump options:\n"
69 "%s: Starts custom performance data collection. Customize the collection behavior with "
70 "the following optional arguments:\n"
71 "\t%s <seconds>: Modifies the collection interval. Default behavior is to collect once "
72 "every %lld seconds.\n"
73 "\t%s <seconds>: Modifies the maximum collection duration. Default behavior is to collect "
74 "until %ld minutes before automatically stopping the custom collection and discarding "
75 "the collected data.\n"
76 "\t%s <package name>,<package name>,...: Comma-separated value containing package names. "
77 "When provided, the results are filtered only to the provided package names. Default "
78 "behavior is to list the results for the top N packages.\n"
79 "%s: Stops custom performance data collection and generates a dump of "
80 "the collection report.\n\n"
81 "When no options are specified, the car watchdog report contains the performance data "
82 "collected during boot-time and over the last few minutes before the report generation.\n";
83
parseSecondsFlag(const char ** args,uint32_t numArgs,size_t pos)84 Result<std::chrono::seconds> parseSecondsFlag(const char** args, uint32_t numArgs, size_t pos) {
85 if (numArgs <= pos) {
86 return Error() << "Value not provided";
87 }
88 uint64_t value;
89 if (std::string strValue = std::string(args[pos]); !ParseUint(strValue, &value)) {
90 return Error() << "Invalid value " << strValue << ", must be an integer";
91 }
92 return std::chrono::seconds(value);
93 }
94
toString(std::variant<EventType,SwitchMessage> what)95 constexpr const char* toString(std::variant<EventType, SwitchMessage> what) {
96 return std::visit(
97 [&](const auto& v) -> const char* {
98 switch (static_cast<int>(v)) {
99 case EventType::INIT:
100 return "INIT";
101 case EventType::TERMINATED:
102 return "TERMINATED";
103 case EventType::BOOT_TIME_COLLECTION:
104 return "BOOT_TIME_COLLECTION";
105 case EventType::PERIODIC_COLLECTION:
106 return "PERIODIC_COLLECTION";
107 case EventType::USER_SWITCH_COLLECTION:
108 return "USER_SWITCH_COLLECTION";
109 case EventType::WAKE_UP_COLLECTION:
110 return "WAKE_UP_COLLECTION";
111 case EventType::CUSTOM_COLLECTION:
112 return "CUSTOM_COLLECTION";
113 case EventType::PERIODIC_MONITOR:
114 return "PERIODIC_MONITOR";
115 case EventType::LAST_EVENT:
116 return "LAST_EVENT";
117 case SwitchMessage::END_BOOTTIME_COLLECTION:
118 return "END_BOOTTIME_COLLECTION";
119 case SwitchMessage::END_USER_SWITCH_COLLECTION:
120 return "END_USER_SWITCH_COLLECTION";
121 case SwitchMessage::END_WAKE_UP_COLLECTION:
122 return "END_WAKE_UP_COLLECTION";
123 case SwitchMessage::END_CUSTOM_COLLECTION:
124 return "END_CUSTOM_COLLECTION";
125 default:
126 return "INVALID_EVENT_OR_SWITCH_MESSAGE";
127 }
128 },
129 what);
130 }
131
toString(SystemState systemState)132 constexpr const char* toString(SystemState systemState) {
133 switch (systemState) {
134 case SystemState::NORMAL_MODE:
135 return "NORMAL_MODE";
136 case SystemState::GARAGE_MODE:
137 return "GARAGE_MODE";
138 default:
139 return "UNKNOWN MODE";
140 }
141 }
142
isEmpty(const ResourceStats & resourceStats)143 bool isEmpty(const ResourceStats& resourceStats) {
144 return !resourceStats.resourceUsageStats.has_value() &&
145 !resourceStats.resourceOveruseStats.has_value();
146 }
147
148 } // namespace
149
toString() const150 std::string WatchdogPerfService::EventMetadata::toString() const {
151 std::string buffer;
152 const auto intervalInSecs =
153 std::chrono::duration_cast<std::chrono::seconds>(pollingIntervalNs).count();
154 StringAppendF(&buffer, "Event polling interval: %lld second%s\n", intervalInSecs,
155 ((intervalInSecs > 1) ? "s" : ""));
156 if (!filterPackages.empty()) {
157 std::vector<std::string> packages(filterPackages.begin(), filterPackages.end());
158 StringAppendF(&buffer, "Filtered results to packages: %s\n", Join(packages, ", ").c_str());
159 }
160 return buffer;
161 }
162
registerDataProcessor(sp<DataProcessorInterface> processor)163 Result<void> WatchdogPerfService::registerDataProcessor(sp<DataProcessorInterface> processor) {
164 if (processor == nullptr) {
165 return Error() << "Must provide a valid data processor";
166 }
167 if (const auto result = processor->init(); !result.ok()) {
168 return Error() << "Failed to initialize " << processor->name().c_str() << ": "
169 << result.error().message();
170 }
171 Mutex::Autolock lock(mMutex);
172 mDataProcessors.push_back(processor);
173 if (DEBUG) {
174 ALOGD("Successfully registered %s to %s", processor->name().c_str(), kServiceName);
175 }
176 return {};
177 }
178
start()179 Result<void> WatchdogPerfService::start() {
180 {
181 Mutex::Autolock lock(mMutex);
182 if (mCurrCollectionEvent != EventType::INIT || mCollectionThread.joinable()) {
183 return Error(INVALID_OPERATION) << "Cannot start " << kServiceName << " more than once";
184 }
185 if (mWatchdogServiceHelper == nullptr) {
186 return Error(INVALID_OPERATION) << "No watchdog service helper is registered";
187 }
188 std::chrono::nanoseconds systemEventCollectionInterval =
189 std::chrono::duration_cast<std::chrono::nanoseconds>(
190 std::chrono::seconds(sysprop::systemEventCollectionInterval().value_or(
191 kDefaultSystemEventCollectionInterval.count())));
192 std::chrono::nanoseconds periodicCollectionInterval =
193 std::chrono::duration_cast<std::chrono::nanoseconds>(
194 std::chrono::seconds(sysprop::periodicCollectionInterval().value_or(
195 kDefaultPeriodicCollectionInterval.count())));
196 std::chrono::nanoseconds periodicMonitorInterval =
197 std::chrono::duration_cast<std::chrono::nanoseconds>(
198 std::chrono::seconds(sysprop::periodicMonitorInterval().value_or(
199 kDefaultPeriodicMonitorInterval.count())));
200 mBoottimeCollection = {
201 .eventType = EventType::BOOT_TIME_COLLECTION,
202 .pollingIntervalNs = systemEventCollectionInterval,
203 };
204 mPeriodicCollection = {
205 .eventType = EventType::PERIODIC_COLLECTION,
206 .pollingIntervalNs = periodicCollectionInterval,
207 };
208 mUserSwitchCollection = {{
209 .eventType = EventType::USER_SWITCH_COLLECTION,
210 .pollingIntervalNs = systemEventCollectionInterval,
211 }};
212 mWakeUpCollection = {
213 .eventType = EventType::WAKE_UP_COLLECTION,
214 .pollingIntervalNs = systemEventCollectionInterval,
215 };
216 mPeriodicMonitor = {
217 .eventType = EventType::PERIODIC_MONITOR,
218 .pollingIntervalNs = periodicMonitorInterval,
219 };
220 if (mDataProcessors.empty()) {
221 ALOGE("Terminating %s: No data processor is registered", kServiceName);
222 mCurrCollectionEvent = EventType::TERMINATED;
223 return Error() << "No data processor is registered";
224 }
225 mUidStatsCollector->init();
226 mProcStatCollector->init();
227 mProcDiskStatsCollector->init();
228 }
229
230 mCollectionThread = std::thread([&]() {
231 {
232 Mutex::Autolock lock(mMutex);
233 if (EventType expected = EventType::INIT; mCurrCollectionEvent != expected) {
234 ALOGE("Skipping performance data collection as the current collection event "
235 "%s != %s",
236 toString(mCurrCollectionEvent), toString(expected));
237 return;
238 }
239 notifySystemStartUpLocked();
240 mCurrCollectionEvent = EventType::BOOT_TIME_COLLECTION;
241 mBoottimeCollection.lastPollUptimeNs = mHandlerLooper->now();
242 mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
243 mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
244 EventType::BOOT_TIME_COLLECTION);
245 }
246 if (set_sched_policy(0, SP_BACKGROUND) != 0) {
247 ALOGW("Failed to set background scheduling priority to %s thread", kServiceName);
248 }
249 if (int result = pthread_setname_np(pthread_self(), "WatchdogPerfSvc"); result != 0) {
250 ALOGE("Failed to set %s thread name: %d", kServiceName, result);
251 }
252 ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
253 bool isCollectionActive = true;
254 /*
255 * Loop until the collection is not active -- performance collection runs on this thread in
256 * a handler.
257 */
258 while (isCollectionActive) {
259 mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
260 Mutex::Autolock lock(mMutex);
261 isCollectionActive = mCurrCollectionEvent != EventType::TERMINATED;
262 }
263 });
264 return {};
265 }
266
terminate()267 void WatchdogPerfService::terminate() {
268 {
269 Mutex::Autolock lock(mMutex);
270 if (mCurrCollectionEvent == EventType::TERMINATED) {
271 ALOGE("%s was terminated already", kServiceName);
272 return;
273 }
274 ALOGE("Terminating %s as car watchdog is terminating", kServiceName);
275 if (mCurrCollectionEvent != EventType::INIT) {
276 /*
277 * Looper runs only after EventType::INIT has completed so remove looper messages
278 * and wake the looper only when the current collection has changed from INIT.
279 */
280 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
281 mHandlerLooper->wake();
282 }
283 for (const auto& processor : mDataProcessors) {
284 processor->terminate();
285 }
286 mCurrCollectionEvent = EventType::TERMINATED;
287 mUnsentResourceStats.clear();
288 }
289 if (mCollectionThread.joinable()) {
290 mCollectionThread.join();
291 if (DEBUG) {
292 ALOGD("%s collection thread terminated", kServiceName);
293 }
294 }
295 }
296
setSystemState(SystemState systemState)297 void WatchdogPerfService::setSystemState(SystemState systemState) {
298 Mutex::Autolock lock(mMutex);
299 if (mSystemState != systemState) {
300 ALOGI("%s switching from %s to %s", kServiceName, toString(mSystemState),
301 toString(systemState));
302 }
303 mSystemState = systemState;
304 }
305
onCarWatchdogServiceRegistered()306 void WatchdogPerfService::onCarWatchdogServiceRegistered() {
307 Mutex::Autolock lock(mMutex);
308 for (const auto& processor : mDataProcessors) {
309 processor->onCarWatchdogServiceRegistered();
310 }
311 if (mUnsentResourceStats.empty()) {
312 return;
313 }
314 mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
315 TaskMessage::SEND_RESOURCE_STATS);
316 }
317
onBootFinished()318 Result<void> WatchdogPerfService::onBootFinished() {
319 Mutex::Autolock lock(mMutex);
320 if (EventType expected = EventType::BOOT_TIME_COLLECTION; mCurrCollectionEvent != expected) {
321 /*
322 * This case happens when either the WatchdogPerfService has prematurely terminated before
323 * boot complete notification is received or multiple boot complete notifications are
324 * received. In either case don't return error as this will lead to runtime exception and
325 * cause system to boot loop.
326 */
327 ALOGE("Current performance data collection event %s != %s", toString(mCurrCollectionEvent),
328 toString(expected));
329 return {};
330 }
331 mHandlerLooper->sendMessageAtTime(mHandlerLooper->now() + mPostSystemEventDurationNs.count(),
332 sp<WatchdogPerfService>::fromExisting(this),
333 SwitchMessage::END_BOOTTIME_COLLECTION);
334 if (DEBUG) {
335 ALOGD("Boot complete signal received.");
336 }
337 return {};
338 }
339
onUserStateChange(userid_t userId,const UserState & userState)340 Result<void> WatchdogPerfService::onUserStateChange(userid_t userId, const UserState& userState) {
341 Mutex::Autolock lock(mMutex);
342 if (mCurrCollectionEvent == EventType::BOOT_TIME_COLLECTION ||
343 mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
344 // Ignoring the user switch events because the boot-time and custom collections take
345 // precedence over other collections.
346 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
347 ALOGW("Unable to start %s. Current performance data collection event: %s",
348 toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
349 }
350 return {};
351 }
352 switch (static_cast<int>(userState)) {
353 case static_cast<int>(UserState::USER_STATE_SWITCHING):
354 // TODO(b/243984863): Handle multi-user switching scenario.
355 mUserSwitchCollection.from = mUserSwitchCollection.to;
356 mUserSwitchCollection.to = userId;
357 if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION &&
358 mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
359 ALOGE("Unable to start %s. Current performance data collection event: %s",
360 toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
361 return {};
362 }
363 startUserSwitchCollection();
364 ALOGI("Switching to %s (userIds: from = %d, to = %d)", toString(mCurrCollectionEvent),
365 mUserSwitchCollection.from, mUserSwitchCollection.to);
366 break;
367 case static_cast<int>(UserState::USER_STATE_UNLOCKING):
368 if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION) {
369 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
370 ALOGE("Unable to start %s. Current performance data collection event: %s",
371 toString(EventType::USER_SWITCH_COLLECTION),
372 toString(mCurrCollectionEvent));
373 }
374 return {};
375 }
376 if (mUserSwitchCollection.to != userId) {
377 return {};
378 }
379 startUserSwitchCollection();
380 ALOGI("Switching to %s (userId: %d)", toString(mCurrCollectionEvent), userId);
381 break;
382 case static_cast<int>(UserState::USER_STATE_POST_UNLOCKED): {
383 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
384 ALOGE("Ignoring USER_STATE_POST_UNLOCKED because no user switch collection in "
385 "progress. Current performance data collection event: %s.",
386 toString(mCurrCollectionEvent));
387 return {};
388 }
389 if (mUserSwitchCollection.to != userId) {
390 ALOGE("Ignoring USER_STATE_POST_UNLOCKED signal for user id: %d. "
391 "Current user being switched to: %d",
392 userId, mUserSwitchCollection.to);
393 return {};
394 }
395 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
396 mHandlerLooper->removeMessages(thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
397 nsecs_t endUserSwitchCollectionTime =
398 mHandlerLooper->now() + mPostSystemEventDurationNs.count();
399 mHandlerLooper->sendMessageAtTime(endUserSwitchCollectionTime, thiz,
400 SwitchMessage::END_USER_SWITCH_COLLECTION);
401 break;
402 }
403 default:
404 ALOGE("Unsupported user state: %d", static_cast<int>(userState));
405 return {};
406 }
407 if (DEBUG) {
408 ALOGD("Handled user state change: userId = %d, userState = %d", userId,
409 static_cast<int>(userState));
410 }
411 return {};
412 }
413
startUserSwitchCollection()414 Result<void> WatchdogPerfService::startUserSwitchCollection() {
415 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
416 mHandlerLooper->removeMessages(thiz);
417 mUserSwitchCollection.lastPollUptimeNs = mHandlerLooper->now();
418 // End |EventType::USER_SWITCH_COLLECTION| after a timeout because the user switch end
419 // signal won't be received within a few seconds when the switch is blocked due to a
420 // keyguard event. Otherwise, polling beyond a few seconds will lead to unnecessary data
421 // collection.
422 mHandlerLooper->sendMessageAtTime(mHandlerLooper->now() + mUserSwitchTimeoutNs.count(), thiz,
423 SwitchMessage::END_USER_SWITCH_COLLECTION);
424 mCurrCollectionEvent = EventType::USER_SWITCH_COLLECTION;
425 mHandlerLooper->sendMessage(thiz, EventType::USER_SWITCH_COLLECTION);
426 return {};
427 }
428
onSuspendExit()429 Result<void> WatchdogPerfService::onSuspendExit() {
430 Mutex::Autolock lock(mMutex);
431 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
432 // Ignoring the suspend exit event because the custom collection takes
433 // precedence over other collections.
434 ALOGE("Unable to start %s. Current performance data collection event: %s",
435 toString(EventType::WAKE_UP_COLLECTION), toString(mCurrCollectionEvent));
436 return {};
437 }
438 if (mCurrCollectionEvent == EventType::WAKE_UP_COLLECTION) {
439 ALOGE("The current performance data collection event is already %s",
440 toString(EventType::WAKE_UP_COLLECTION));
441 return {};
442 }
443 notifySystemStartUpLocked();
444 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
445 mHandlerLooper->removeMessages(thiz);
446 nsecs_t now = mHandlerLooper->now();
447 mWakeUpCollection.lastPollUptimeNs = now;
448 mHandlerLooper->sendMessageAtTime(now + mWakeUpDurationNs.count(), thiz,
449 SwitchMessage::END_WAKE_UP_COLLECTION);
450 mCurrCollectionEvent = EventType::WAKE_UP_COLLECTION;
451 mHandlerLooper->sendMessage(thiz, EventType::WAKE_UP_COLLECTION);
452 ALOGI("Switching to %s", toString(mCurrCollectionEvent));
453 return {};
454 }
455
onShutdownEnter()456 Result<void> WatchdogPerfService::onShutdownEnter() {
457 Mutex::Autolock lock(mMutex);
458 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
459 ALOGI("Unable to switch to %s during shutdown enter. Current performance data collection "
460 "event: %s",
461 toString(EventType::PERIODIC_COLLECTION), toString(mCurrCollectionEvent));
462 return {};
463 }
464 switchToPeriodicLocked(/*startNow=*/true);
465 return {};
466 }
467
onCustomCollection(int fd,const char ** args,uint32_t numArgs)468 Result<void> WatchdogPerfService::onCustomCollection(int fd, const char** args, uint32_t numArgs) {
469 if (numArgs == 0) {
470 return Error(BAD_VALUE) << "No custom collection dump arguments";
471 }
472
473 if (EqualsIgnoreCase(args[0], kStartCustomCollectionFlag)) {
474 if (numArgs > 7) {
475 return Error(BAD_VALUE) << "Number of arguments to start custom performance data "
476 << "collection cannot exceed 7";
477 }
478 std::chrono::nanoseconds interval = kCustomCollectionInterval;
479 std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
480 std::unordered_set<std::string> filterPackages;
481 for (uint32_t i = 1; i < numArgs; ++i) {
482 if (EqualsIgnoreCase(args[i], kIntervalFlag)) {
483 const auto& result = parseSecondsFlag(args, numArgs, i + 1);
484 if (!result.ok()) {
485 return Error(BAD_VALUE)
486 << "Failed to parse " << kIntervalFlag << ": " << result.error();
487 }
488 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
489 ++i;
490 continue;
491 }
492 if (EqualsIgnoreCase(args[i], kMaxDurationFlag)) {
493 const auto& result = parseSecondsFlag(args, numArgs, i + 1);
494 if (!result.ok()) {
495 return Error(BAD_VALUE)
496 << "Failed to parse " << kMaxDurationFlag << ": " << result.error();
497 }
498 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
499 ++i;
500 continue;
501 }
502 if (EqualsIgnoreCase(args[i], kFilterPackagesFlag)) {
503 if (numArgs < i + 1) {
504 return Error(BAD_VALUE)
505 << "Must provide value for '" << kFilterPackagesFlag << "' flag";
506 }
507 std::vector<std::string> packages = Split(std::string(args[i + 1]), ",");
508 std::copy(packages.begin(), packages.end(),
509 std::inserter(filterPackages, filterPackages.end()));
510 ++i;
511 continue;
512 }
513 return Error(BAD_VALUE) << "Unknown flag " << args[i]
514 << " provided to start custom performance data collection";
515 }
516 if (const auto& result = startCustomCollection(interval, maxDuration, filterPackages);
517 !result.ok()) {
518 return result;
519 }
520 return {};
521 }
522 if (EqualsIgnoreCase(args[0], kEndCustomCollectionFlag)) {
523 if (numArgs != 1) {
524 ALOGW("Number of arguments to stop custom performance data collection cannot exceed 1. "
525 "Stopping the data collection.");
526 WriteStringToFd("Number of arguments to stop custom performance data collection "
527 "cannot exceed 1. Stopping the data collection.",
528 fd);
529 }
530 return endCustomCollection(fd);
531 }
532 return Error(BAD_VALUE) << "Custom perf collection dump arguments start neither with "
533 << kStartCustomCollectionFlag << " nor with "
534 << kEndCustomCollectionFlag << " flags";
535 }
536
onDump(int fd) const537 Result<void> WatchdogPerfService::onDump(int fd) const {
538 Mutex::Autolock lock(mMutex);
539 if (mCurrCollectionEvent == EventType::TERMINATED) {
540 ALOGW("%s not active. Dumping cached data", kServiceName);
541 if (!WriteStringToFd(StringPrintf("%s not active. Dumping cached data.", kServiceName),
542 fd)) {
543 return Error(FAILED_TRANSACTION) << "Failed to write " << kServiceName << " status";
544 }
545 }
546
547 if (const auto& result = dumpCollectorsStatusLocked(fd); !result.ok()) {
548 return Error(FAILED_TRANSACTION) << result.error();
549 }
550
551 if (!WriteStringToFd(StringPrintf("\n%s%s report:\n%sBoot-time collection information:\n%s\n",
552 kDumpMajorDelimiter.c_str(), kServiceName,
553 kDumpMajorDelimiter.c_str(), std::string(33, '=').c_str()),
554 fd) ||
555 !WriteStringToFd(mBoottimeCollection.toString(), fd) ||
556 !WriteStringToFd(StringPrintf("\nWake-up collection information:\n%s\n",
557 std::string(31, '=').c_str()),
558 fd) ||
559 !WriteStringToFd(mWakeUpCollection.toString(), fd) ||
560 !WriteStringToFd(StringPrintf("\nUser-switch collection information:\n%s\n",
561 std::string(35, '=').c_str()),
562 fd) ||
563 !WriteStringToFd(mUserSwitchCollection.toString(), fd) ||
564 !WriteStringToFd(StringPrintf("\nPeriodic collection information:\n%s\n",
565 std::string(32, '=').c_str()),
566 fd) ||
567 !WriteStringToFd(mPeriodicCollection.toString(), fd)) {
568 return Error(FAILED_TRANSACTION)
569 << "Failed to dump the boot-time and periodic collection reports.";
570 }
571
572 for (const auto& processor : mDataProcessors) {
573 if (const auto result = processor->onDump(fd); !result.ok()) {
574 return result;
575 }
576 }
577
578 WriteStringToFd(kDumpMajorDelimiter, fd);
579 return {};
580 }
581
dumpHelpText(int fd) const582 bool WatchdogPerfService::dumpHelpText(int fd) const {
583 return WriteStringToFd(StringPrintf(kHelpText, kServiceName, kStartCustomCollectionFlag,
584 kIntervalFlag,
585 std::chrono::duration_cast<std::chrono::seconds>(
586 kCustomCollectionInterval)
587 .count(),
588 kMaxDurationFlag,
589 std::chrono::duration_cast<std::chrono::minutes>(
590 kCustomCollectionDuration)
591 .count(),
592 kFilterPackagesFlag, kEndCustomCollectionFlag),
593 fd);
594 }
595
dumpCollectorsStatusLocked(int fd) const596 Result<void> WatchdogPerfService::dumpCollectorsStatusLocked(int fd) const {
597 if (!mUidStatsCollector->enabled() &&
598 !WriteStringToFd(StringPrintf("UidStatsCollector failed to access proc and I/O files"),
599 fd)) {
600 return Error() << "Failed to write UidStatsCollector status";
601 }
602 if (!mProcStatCollector->enabled() &&
603 !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
604 mProcStatCollector->filePath().c_str()),
605 fd)) {
606 return Error() << "Failed to write ProcStat collector status";
607 }
608 return {};
609 }
610
startCustomCollection(std::chrono::nanoseconds interval,std::chrono::nanoseconds maxDuration,const std::unordered_set<std::string> & filterPackages)611 Result<void> WatchdogPerfService::startCustomCollection(
612 std::chrono::nanoseconds interval, std::chrono::nanoseconds maxDuration,
613 const std::unordered_set<std::string>& filterPackages) {
614 if (interval < kMinEventInterval || maxDuration < kMinEventInterval) {
615 return Error(INVALID_OPERATION)
616 << "Collection polling interval and maximum duration must be >= "
617 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinEventInterval).count()
618 << " milliseconds";
619 }
620 Mutex::Autolock lock(mMutex);
621 if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
622 return Error(INVALID_OPERATION) << "Cannot start custom collection more than once";
623 }
624 nsecs_t now = mHandlerLooper->now();
625 mCustomCollection = {
626 .eventType = EventType::CUSTOM_COLLECTION,
627 .pollingIntervalNs = interval,
628 .lastPollUptimeNs = now,
629 .filterPackages = filterPackages,
630 };
631
632 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
633 mHandlerLooper->removeMessages(thiz);
634 mHandlerLooper->sendMessageAtTime(now + maxDuration.count(), thiz,
635 SwitchMessage::END_CUSTOM_COLLECTION);
636 mCurrCollectionEvent = EventType::CUSTOM_COLLECTION;
637 mHandlerLooper->sendMessage(thiz, EventType::CUSTOM_COLLECTION);
638 ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
639 return {};
640 }
641
endCustomCollection(int fd)642 Result<void> WatchdogPerfService::endCustomCollection(int fd) {
643 Mutex::Autolock lock(mMutex);
644 if (mCurrCollectionEvent != EventType::CUSTOM_COLLECTION) {
645 return Error(INVALID_OPERATION) << "No custom collection is running";
646 }
647
648 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
649 mHandlerLooper->removeMessages(thiz);
650 mHandlerLooper->sendMessage(thiz, SwitchMessage::END_CUSTOM_COLLECTION);
651
652 if (const auto result = dumpCollectorsStatusLocked(fd); !result.ok()) {
653 return Error(FAILED_TRANSACTION) << result.error();
654 }
655
656 if (!WriteStringToFd(StringPrintf("%sPerformance data report for custom collection:\n%s",
657 kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
658 fd) ||
659 !WriteStringToFd(mCustomCollection.toString(), fd)) {
660 return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
661 }
662
663 for (const auto& processor : mDataProcessors) {
664 if (const auto result = processor->onCustomCollectionDump(fd); !result.ok()) {
665 return Error(FAILED_TRANSACTION)
666 << processor->name() << " failed on " << toString(mCurrCollectionEvent)
667 << " collection: " << result.error();
668 }
669 }
670
671 if (DEBUG) {
672 ALOGD("Custom event finished");
673 }
674 WriteStringToFd(kDumpMajorDelimiter, fd);
675 return {};
676 }
677
switchToPeriodicLocked(bool startNow)678 void WatchdogPerfService::switchToPeriodicLocked(bool startNow) {
679 if (mCurrCollectionEvent == EventType::PERIODIC_COLLECTION) {
680 ALOGW("The current performance data collection event is already %s",
681 toString(mCurrCollectionEvent));
682 return;
683 }
684 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
685 mHandlerLooper->removeMessages(thiz);
686 mCurrCollectionEvent = EventType::PERIODIC_COLLECTION;
687 mPeriodicCollection.lastPollUptimeNs = mHandlerLooper->now();
688 if (startNow) {
689 mHandlerLooper->sendMessage(thiz, EventType::PERIODIC_COLLECTION);
690 } else {
691 mPeriodicCollection.lastPollUptimeNs += mPeriodicCollection.pollingIntervalNs.count();
692 mHandlerLooper->sendMessageAtTime(mPeriodicCollection.lastPollUptimeNs, thiz,
693 EventType::PERIODIC_COLLECTION);
694 }
695 mPeriodicMonitor.lastPollUptimeNs =
696 mHandlerLooper->now() + mPeriodicMonitor.pollingIntervalNs.count();
697 mHandlerLooper->sendMessageAtTime(mPeriodicMonitor.lastPollUptimeNs, thiz,
698 EventType::PERIODIC_MONITOR);
699 ALOGI("Switching to %s and %s", toString(mCurrCollectionEvent),
700 toString(EventType::PERIODIC_MONITOR));
701 }
702
handleMessage(const Message & message)703 void WatchdogPerfService::handleMessage(const Message& message) {
704 Result<void> result;
705
706 switch (message.what) {
707 case static_cast<int>(EventType::BOOT_TIME_COLLECTION):
708 result = processCollectionEvent(&mBoottimeCollection);
709 break;
710 case static_cast<int>(SwitchMessage::END_BOOTTIME_COLLECTION):
711 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
712 if (result = processCollectionEvent(&mBoottimeCollection); result.ok()) {
713 Mutex::Autolock lock(mMutex);
714 switchToPeriodicLocked(/*startNow=*/false);
715 }
716 break;
717 case static_cast<int>(EventType::PERIODIC_COLLECTION):
718 result = processCollectionEvent(&mPeriodicCollection);
719 break;
720 case static_cast<int>(EventType::USER_SWITCH_COLLECTION):
721 result = processCollectionEvent(&mUserSwitchCollection);
722 break;
723 case static_cast<int>(EventType::WAKE_UP_COLLECTION):
724 result = processCollectionEvent(&mWakeUpCollection);
725 break;
726 case static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION):
727 case static_cast<int>(SwitchMessage::END_WAKE_UP_COLLECTION): {
728 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
729 EventMetadata* eventMetadata =
730 message.what == static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION)
731 ? &mUserSwitchCollection
732 : &mWakeUpCollection;
733 if (result = processCollectionEvent(eventMetadata); result.ok()) {
734 Mutex::Autolock lock(mMutex);
735 switchToPeriodicLocked(/*startNow=*/false);
736 }
737 break;
738 }
739 case static_cast<int>(EventType::CUSTOM_COLLECTION):
740 result = processCollectionEvent(&mCustomCollection);
741 break;
742 case static_cast<int>(EventType::PERIODIC_MONITOR):
743 result = processMonitorEvent(&mPeriodicMonitor);
744 break;
745 case static_cast<int>(SwitchMessage::END_CUSTOM_COLLECTION): {
746 Mutex::Autolock lock(mMutex);
747 if (EventType expected = EventType::CUSTOM_COLLECTION;
748 mCurrCollectionEvent != expected) {
749 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
750 toString(mCurrCollectionEvent), toString(expected));
751 return;
752 }
753 mCustomCollection = {};
754 for (const auto& processor : mDataProcessors) {
755 /*
756 * Clear custom collection cache on the data processors when the custom collection
757 * ends.
758 */
759 processor->onCustomCollectionDump(-1);
760 }
761 switchToPeriodicLocked(/*startNow=*/true);
762 return;
763 }
764 case static_cast<int>(TaskMessage::SEND_RESOURCE_STATS):
765 result = sendResourceStats();
766 break;
767 default:
768 result = Error() << "Unknown message: " << message.what;
769 }
770
771 if (!result.ok()) {
772 Mutex::Autolock lock(mMutex);
773 ALOGE("Terminating %s: %s", kServiceName, result.error().message().c_str());
774 /*
775 * DO NOT CALL terminate() as it tries to join the collection thread but this code is
776 * executed on the collection thread. Thus it will result in a deadlock.
777 */
778 mCurrCollectionEvent = EventType::TERMINATED;
779 mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
780 mHandlerLooper->wake();
781 }
782 }
783
processCollectionEvent(WatchdogPerfService::EventMetadata * metadata)784 Result<void> WatchdogPerfService::processCollectionEvent(
785 WatchdogPerfService::EventMetadata* metadata) {
786 Mutex::Autolock lock(mMutex);
787 /*
788 * Messages sent to the looper are intrinsically racy such that a message from the previous
789 * collection event may land in the looper after the current collection has already begun. Thus
790 * verify the current collection event before starting the collection.
791 */
792 if (mCurrCollectionEvent != metadata->eventType) {
793 ALOGW("Skipping %s event on collection event %s", toString(metadata->eventType),
794 toString(mCurrCollectionEvent));
795 return {};
796 }
797 if (DEBUG) {
798 ALOGD("Processing %s collection event", toString(metadata->eventType));
799 }
800 if (metadata->pollingIntervalNs < kMinEventInterval) {
801 return Error()
802 << "Collection polling interval of "
803 << std::chrono::duration_cast<std::chrono::seconds>(metadata->pollingIntervalNs)
804 .count()
805 << " seconds for " << toString(metadata->eventType)
806 << " collection cannot be less than "
807 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
808 << " seconds";
809 }
810 if (const auto result = collectLocked(metadata); !result.ok()) {
811 return Error() << toString(metadata->eventType) << " collection failed: " << result.error();
812 }
813 metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
814 mHandlerLooper->sendMessageAtTime(metadata->lastPollUptimeNs,
815 sp<WatchdogPerfService>::fromExisting(this),
816 metadata->eventType);
817 return {};
818 }
819
collectLocked(WatchdogPerfService::EventMetadata * metadata)820 Result<void> WatchdogPerfService::collectLocked(WatchdogPerfService::EventMetadata* metadata) {
821 if (!mUidStatsCollector->enabled() && !mProcStatCollector->enabled()) {
822 return Error() << "No collectors enabled";
823 }
824
825 time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
826
827 if (mUidStatsCollector->enabled()) {
828 if (const auto result = mUidStatsCollector->collect(); !result.ok()) {
829 return Error() << "Failed to collect per-uid proc and I/O stats: " << result.error();
830 }
831 }
832
833 if (mProcStatCollector->enabled()) {
834 if (const auto result = mProcStatCollector->collect(); !result.ok()) {
835 return Error() << "Failed to collect proc stats: " << result.error();
836 }
837 }
838
839 ResourceStats resourceStats = {};
840
841 for (const auto& processor : mDataProcessors) {
842 Result<void> result;
843 switch (mCurrCollectionEvent) {
844 case EventType::BOOT_TIME_COLLECTION:
845 result = processor->onBoottimeCollection(now, mUidStatsCollector,
846 mProcStatCollector, &resourceStats);
847 break;
848 case EventType::PERIODIC_COLLECTION:
849 result = processor->onPeriodicCollection(now, mSystemState, mUidStatsCollector,
850 mProcStatCollector, &resourceStats);
851 break;
852 case EventType::USER_SWITCH_COLLECTION: {
853 WatchdogPerfService::UserSwitchEventMetadata* userSwitchMetadata =
854 static_cast<WatchdogPerfService::UserSwitchEventMetadata*>(metadata);
855 result = processor->onUserSwitchCollection(now, userSwitchMetadata->from,
856 userSwitchMetadata->to,
857 mUidStatsCollector, mProcStatCollector);
858 break;
859 }
860 case EventType::WAKE_UP_COLLECTION:
861 result = processor->onWakeUpCollection(now, mUidStatsCollector, mProcStatCollector);
862 break;
863 case EventType::CUSTOM_COLLECTION:
864 result = processor->onCustomCollection(now, mSystemState, metadata->filterPackages,
865 mUidStatsCollector, mProcStatCollector,
866 &resourceStats);
867 break;
868 default:
869 result = Error() << "Invalid collection event " << toString(mCurrCollectionEvent);
870 }
871 if (!result.ok()) {
872 return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
873 << " collection: " << result.error();
874 }
875 }
876
877 if (!isEmpty(resourceStats)) {
878 cacheUnsentResourceStatsLocked(std::move(resourceStats));
879 }
880
881 if (mUnsentResourceStats.empty() || !mWatchdogServiceHelper->isServiceConnected()) {
882 if (DEBUG && !mWatchdogServiceHelper->isServiceConnected()) {
883 ALOGD("Cannot send resource stats since CarWatchdogService not connected.");
884 }
885 return {};
886 }
887
888 // Send message to send resource stats
889 mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
890 TaskMessage::SEND_RESOURCE_STATS);
891
892 return {};
893 }
894
processMonitorEvent(WatchdogPerfService::EventMetadata * metadata)895 Result<void> WatchdogPerfService::processMonitorEvent(
896 WatchdogPerfService::EventMetadata* metadata) {
897 if (metadata->eventType != static_cast<int>(EventType::PERIODIC_MONITOR)) {
898 return Error() << "Invalid monitor event " << toString(metadata->eventType);
899 }
900 if (DEBUG) {
901 ALOGD("Processing %s monitor event", toString(metadata->eventType));
902 }
903 if (metadata->pollingIntervalNs < kMinEventInterval) {
904 return Error()
905 << "Monitor polling interval of "
906 << std::chrono::duration_cast<std::chrono::seconds>(metadata->pollingIntervalNs)
907 .count()
908 << " seconds for " << toString(metadata->eventType) << " event cannot be less than "
909 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
910 << " seconds";
911 }
912 Mutex::Autolock lock(mMutex);
913 if (!mProcDiskStatsCollector->enabled()) {
914 return Error() << "Cannot access proc disk stats for monitoring";
915 }
916 time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
917 if (const auto result = mProcDiskStatsCollector->collect(); !result.ok()) {
918 return Error() << "Failed to collect disk stats: " << result.error();
919 }
920 auto* currCollectionMetadata = getCurrentCollectionMetadataLocked();
921 if (currCollectionMetadata == nullptr) {
922 return Error() << "No metadata available for current collection event: "
923 << toString(mCurrCollectionEvent);
924 }
925 bool requestedCollection = false;
926 auto thiz = sp<WatchdogPerfService>::fromExisting(this);
927 const auto requestCollection = [&]() mutable {
928 if (requestedCollection) {
929 return;
930 }
931 const nsecs_t prevUptimeNs = currCollectionMetadata->lastPollUptimeNs -
932 currCollectionMetadata->pollingIntervalNs.count();
933 nsecs_t uptimeNs = mHandlerLooper->now();
934 if (const auto delta = std::abs(uptimeNs - prevUptimeNs);
935 delta < kMinEventInterval.count()) {
936 return;
937 }
938 currCollectionMetadata->lastPollUptimeNs = uptimeNs;
939 mHandlerLooper->removeMessages(thiz, currCollectionMetadata->eventType);
940 mHandlerLooper->sendMessage(thiz, currCollectionMetadata->eventType);
941 requestedCollection = true;
942 };
943 for (const auto& processor : mDataProcessors) {
944 if (const auto result =
945 processor->onPeriodicMonitor(now, mProcDiskStatsCollector, requestCollection);
946 !result.ok()) {
947 return Error() << processor->name() << " failed on " << toString(metadata->eventType)
948 << ": " << result.error();
949 }
950 }
951 metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
952 if (metadata->lastPollUptimeNs == currCollectionMetadata->lastPollUptimeNs) {
953 /*
954 * If the |PERIODIC_MONITOR| and *_COLLECTION events overlap, skip the |PERIODIC_MONITOR|
955 * event.
956 */
957 metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
958 }
959 mHandlerLooper->sendMessageAtTime(metadata->lastPollUptimeNs, thiz, metadata->eventType);
960 return {};
961 }
962
sendResourceStats()963 Result<void> WatchdogPerfService::sendResourceStats() {
964 std::vector<ResourceStats> unsentResourceStats = {};
965 {
966 Mutex::Autolock lock(mMutex);
967 nsecs_t now = mHandlerLooper->now();
968 for (auto it = mUnsentResourceStats.begin(); it != mUnsentResourceStats.end();) {
969 if (now - std::get<nsecs_t>(*it) >= kPrevUnsentResourceStatsMaxDurationNs.count()) {
970 // Drop the expired stats
971 it = mUnsentResourceStats.erase(it);
972 continue;
973 }
974 unsentResourceStats.push_back(std::get<ResourceStats>(*it));
975 ++it;
976 }
977 }
978 if (unsentResourceStats.empty()) {
979 return {};
980 }
981 if (auto status = mWatchdogServiceHelper->onLatestResourceStats(unsentResourceStats);
982 !status.isOk()) {
983 ALOGW("Failed to push the unsent resource stats to watchdog service: %s",
984 status.getDescription().c_str());
985 return {};
986 }
987 Mutex::Autolock lock(mMutex);
988 mUnsentResourceStats.clear();
989 if (DEBUG) {
990 ALOGD("Pushed latest resource usage and I/O overuse stats to watchdog service");
991 }
992 return {};
993 }
994
notifySystemStartUpLocked()995 Result<void> WatchdogPerfService::notifySystemStartUpLocked() {
996 for (const auto& processor : mDataProcessors) {
997 if (const auto result = processor->onSystemStartup(); !result.ok()) {
998 ALOGE("%s failed to process system startup event", processor->name().c_str());
999 return Error() << processor->name() << " failed to process system startup event";
1000 }
1001 }
1002 return {};
1003 }
1004
cacheUnsentResourceStatsLocked(ResourceStats resourceStats)1005 void WatchdogPerfService::cacheUnsentResourceStatsLocked(ResourceStats resourceStats) {
1006 mUnsentResourceStats.push_back(
1007 std::make_tuple(mHandlerLooper->now(), std::move(resourceStats)));
1008 if (mUnsentResourceStats.size() > kMaxCachedUnsentResourceStats) {
1009 mUnsentResourceStats.erase(mUnsentResourceStats.begin());
1010 }
1011 }
1012
getCurrentCollectionMetadataLocked()1013 WatchdogPerfService::EventMetadata* WatchdogPerfService::getCurrentCollectionMetadataLocked() {
1014 switch (mCurrCollectionEvent) {
1015 case EventType::BOOT_TIME_COLLECTION:
1016 return &mBoottimeCollection;
1017 case EventType::PERIODIC_COLLECTION:
1018 return &mPeriodicCollection;
1019 case EventType::USER_SWITCH_COLLECTION:
1020 return &mUserSwitchCollection;
1021 case EventType::WAKE_UP_COLLECTION:
1022 return &mWakeUpCollection;
1023 case EventType::CUSTOM_COLLECTION:
1024 return &mCustomCollection;
1025 default:
1026 return nullptr;
1027 }
1028 }
1029
1030 } // namespace watchdog
1031 } // namespace automotive
1032 } // namespace android
1033