• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false  // STOPSHIP if true.
19 
20 #include "WatchdogPerfService.h"
21 
22 #include <android-base/file.h>
23 #include <android-base/parseint.h>
24 #include <android-base/stringprintf.h>
25 #include <android-base/strings.h>
26 #include <log/log.h>
27 #include <processgroup/sched_policy.h>
28 
29 #include <pthread.h>
30 
31 #include <iterator>
32 #include <vector>
33 
34 namespace android {
35 namespace automotive {
36 namespace watchdog {
37 
38 namespace {
39 
40 using ::android::sp;
41 using ::android::String16;
42 using ::android::String8;
43 using ::android::automotive::watchdog::internal::PowerCycle;
44 using ::android::automotive::watchdog::internal::UserState;
45 using ::android::base::Error;
46 using ::android::base::Join;
47 using ::android::base::ParseUint;
48 using ::android::base::Result;
49 using ::android::base::Split;
50 using ::android::base::StringAppendF;
51 using ::android::base::StringPrintf;
52 using ::android::base::WriteStringToFd;
53 
54 // Minimum required collection interval between subsequent collections.
55 const std::chrono::nanoseconds kMinEventInterval = 1s;
56 const std::chrono::seconds kDefaultSystemEventCollectionInterval = 1s;
57 const std::chrono::seconds kDefaultPeriodicCollectionInterval = 20s;
58 const std::chrono::seconds kDefaultPeriodicMonitorInterval = 5s;
59 const std::chrono::nanoseconds kCustomCollectionInterval = 10s;
60 const std::chrono::nanoseconds kCustomCollectionDuration = 30min;
61 
62 constexpr const char* kServiceName = "WatchdogPerfService";
63 static const std::string kDumpMajorDelimiter = std::string(100, '-') + "\n";  // NOLINT
64 constexpr const char* kHelpText =
65         "\n%s dump options:\n"
66         "%s: Starts custom performance data collection. Customize the collection behavior with "
67         "the following optional arguments:\n"
68         "\t%s <seconds>: Modifies the collection interval. Default behavior is to collect once "
69         "every %lld seconds.\n"
70         "\t%s <seconds>: Modifies the maximum collection duration. Default behavior is to collect "
71         "until %ld minutes before automatically stopping the custom collection and discarding "
72         "the collected data.\n"
73         "\t%s <package name>,<package name>,...: Comma-separated value containing package names. "
74         "When provided, the results are filtered only to the provided package names. Default "
75         "behavior is to list the results for the top N packages.\n"
76         "%s: Stops custom performance data collection and generates a dump of "
77         "the collection report.\n\n"
78         "When no options are specified, the car watchdog report contains the performance data "
79         "collected during boot-time and over the last few minutes before the report generation.\n";
80 
parseSecondsFlag(const Vector<String16> & args,size_t pos)81 Result<std::chrono::seconds> parseSecondsFlag(const Vector<String16>& args, size_t pos) {
82     if (args.size() <= pos) {
83         return Error() << "Value not provided";
84     }
85     uint64_t value;
86     if (std::string strValue = std::string(String8(args[pos]).string());
87         !ParseUint(strValue, &value)) {
88         return Error() << "Invalid value " << strValue << ", must be an integer";
89     }
90     return std::chrono::seconds(value);
91 }
92 
toString(std::variant<EventType,SwitchMessage> what)93 constexpr const char* toString(std::variant<EventType, SwitchMessage> what) {
94     return std::visit(
95             [&](const auto& v) -> const char* {
96                 switch (static_cast<int>(v)) {
97                     case EventType::INIT:
98                         return "INIT";
99                     case EventType::TERMINATED:
100                         return "TERMINATED";
101                     case EventType::BOOT_TIME_COLLECTION:
102                         return "BOOT_TIME_COLLECTION";
103                     case EventType::PERIODIC_COLLECTION:
104                         return "PERIODIC_COLLECTION";
105                     case EventType::USER_SWITCH_COLLECTION:
106                         return "USER_SWITCH_COLLECTION";
107                     case EventType::WAKE_UP_COLLECTION:
108                         return "WAKE_UP_COLLECTION";
109                     case EventType::CUSTOM_COLLECTION:
110                         return "CUSTOM_COLLECTION";
111                     case EventType::PERIODIC_MONITOR:
112                         return "PERIODIC_MONITOR";
113                     case EventType::LAST_EVENT:
114                         return "LAST_EVENT";
115                     case SwitchMessage::END_BOOTTIME_COLLECTION:
116                         return "END_BOOTTIME_COLLECTION";
117                     case SwitchMessage::END_USER_SWITCH_COLLECTION:
118                         return "END_USER_SWITCH_COLLECTION";
119                     case SwitchMessage::END_WAKE_UP_COLLECTION:
120                         return "END_WAKE_UP_COLLECTION";
121                     case SwitchMessage::END_CUSTOM_COLLECTION:
122                         return "END_CUSTOM_COLLECTION";
123                     default:
124                         return "INVALID_EVENT_OR_SWITCH_MESSAGE";
125                 }
126             },
127             what);
128 }
129 
toString(SystemState systemState)130 constexpr const char* toString(SystemState systemState) {
131     switch (systemState) {
132         case SystemState::NORMAL_MODE:
133             return "NORMAL_MODE";
134         case SystemState::GARAGE_MODE:
135             return "GARAGE_MODE";
136         default:
137             return "UNKNOWN MODE";
138     }
139 }
140 
141 }  // namespace
142 
toString() const143 std::string WatchdogPerfService::EventMetadata::toString() const {
144     std::string buffer;
145     const auto intervalInSecs = std::chrono::duration_cast<std::chrono::seconds>(interval).count();
146     StringAppendF(&buffer, "Event interval: %lld second%s\n", intervalInSecs,
147                   ((intervalInSecs > 1) ? "s" : ""));
148     if (!filterPackages.empty()) {
149         std::vector<std::string> packages(filterPackages.begin(), filterPackages.end());
150         StringAppendF(&buffer, "Filtered results to packages: %s\n", Join(packages, ", ").c_str());
151     }
152     return buffer;
153 }
154 
registerDataProcessor(sp<DataProcessorInterface> processor)155 Result<void> WatchdogPerfService::registerDataProcessor(sp<DataProcessorInterface> processor) {
156     if (processor == nullptr) {
157         return Error() << "Must provide a valid data processor";
158     }
159     if (const auto result = processor->init(); !result.ok()) {
160         return Error() << "Failed to initialize " << processor->name().c_str() << ": "
161                        << result.error().message();
162     }
163     Mutex::Autolock lock(mMutex);
164     mDataProcessors.push_back(processor);
165     if (DEBUG) {
166         ALOGD("Successfully registered %s to %s", processor->name().c_str(), kServiceName);
167     }
168     return {};
169 }
170 
start()171 Result<void> WatchdogPerfService::start() {
172     {
173         Mutex::Autolock lock(mMutex);
174         if (mCurrCollectionEvent != EventType::INIT || mCollectionThread.joinable()) {
175             return Error(INVALID_OPERATION) << "Cannot start " << kServiceName << " more than once";
176         }
177         std::chrono::nanoseconds systemEventCollectionInterval =
178                 std::chrono::duration_cast<std::chrono::nanoseconds>(
179                         std::chrono::seconds(sysprop::systemEventCollectionInterval().value_or(
180                                 kDefaultSystemEventCollectionInterval.count())));
181         std::chrono::nanoseconds periodicCollectionInterval =
182                 std::chrono::duration_cast<std::chrono::nanoseconds>(
183                         std::chrono::seconds(sysprop::periodicCollectionInterval().value_or(
184                                 kDefaultPeriodicCollectionInterval.count())));
185         std::chrono::nanoseconds periodicMonitorInterval =
186                 std::chrono::duration_cast<std::chrono::nanoseconds>(
187                         std::chrono::seconds(sysprop::periodicMonitorInterval().value_or(
188                                 kDefaultPeriodicMonitorInterval.count())));
189         mBoottimeCollection = {
190                 .eventType = EventType::BOOT_TIME_COLLECTION,
191                 .interval = systemEventCollectionInterval,
192                 .lastUptime = 0,
193         };
194         mPeriodicCollection = {
195                 .eventType = EventType::PERIODIC_COLLECTION,
196                 .interval = periodicCollectionInterval,
197                 .lastUptime = 0,
198         };
199         mUserSwitchCollection = {{
200                 .eventType = EventType::USER_SWITCH_COLLECTION,
201                 .interval = systemEventCollectionInterval,
202                 .lastUptime = 0,
203         }};
204         mWakeUpCollection = {
205                 .eventType = EventType::WAKE_UP_COLLECTION,
206                 .interval = systemEventCollectionInterval,
207                 .lastUptime = 0,
208         };
209         mPeriodicMonitor = {
210                 .eventType = EventType::PERIODIC_MONITOR,
211                 .interval = periodicMonitorInterval,
212                 .lastUptime = 0,
213         };
214         if (mDataProcessors.empty()) {
215             ALOGE("Terminating %s: No data processor is registered", kServiceName);
216             mCurrCollectionEvent = EventType::TERMINATED;
217             return Error() << "No data processor is registered";
218         }
219         mUidStatsCollector->init();
220         mProcStatCollector->init();
221         mProcDiskStatsCollector->init();
222     }
223 
224     mCollectionThread = std::thread([&]() {
225         {
226             Mutex::Autolock lock(mMutex);
227             if (EventType expected = EventType::INIT; mCurrCollectionEvent != expected) {
228                 ALOGE("Skipping performance data collection as the current collection event "
229                       "%s != %s",
230                       toString(mCurrCollectionEvent), toString(expected));
231                 return;
232             }
233             notifySystemStartUpLocked();
234             mCurrCollectionEvent = EventType::BOOT_TIME_COLLECTION;
235             mBoottimeCollection.lastUptime = mHandlerLooper->now();
236             mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
237             mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
238                                         EventType::BOOT_TIME_COLLECTION);
239         }
240         if (set_sched_policy(0, SP_BACKGROUND) != 0) {
241             ALOGW("Failed to set background scheduling priority to %s thread", kServiceName);
242         }
243         if (int result = pthread_setname_np(pthread_self(), "WatchdogPerfSvc"); result != 0) {
244             ALOGE("Failed to set %s thread name: %d", kServiceName, result);
245         }
246         ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
247         bool isCollectionActive = true;
248         /*
249          * Loop until the collection is not active -- performance collection runs on this thread in
250          * a handler.
251          */
252         while (isCollectionActive) {
253             mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
254             Mutex::Autolock lock(mMutex);
255             isCollectionActive = mCurrCollectionEvent != EventType::TERMINATED;
256         }
257     });
258     return {};
259 }
260 
terminate()261 void WatchdogPerfService::terminate() {
262     {
263         Mutex::Autolock lock(mMutex);
264         if (mCurrCollectionEvent == EventType::TERMINATED) {
265             ALOGE("%s was terminated already", kServiceName);
266             return;
267         }
268         ALOGE("Terminating %s as car watchdog is terminating", kServiceName);
269         if (mCurrCollectionEvent != EventType::INIT) {
270             /*
271              * Looper runs only after EventType::INIT has completed so remove looper messages
272              * and wake the looper only when the current collection has changed from INIT.
273              */
274             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
275             mHandlerLooper->wake();
276         }
277         for (const auto& processor : mDataProcessors) {
278             processor->terminate();
279         }
280         mCurrCollectionEvent = EventType::TERMINATED;
281     }
282     if (mCollectionThread.joinable()) {
283         mCollectionThread.join();
284         if (DEBUG) {
285             ALOGD("%s collection thread terminated", kServiceName);
286         }
287     }
288 }
289 
setSystemState(SystemState systemState)290 void WatchdogPerfService::setSystemState(SystemState systemState) {
291     Mutex::Autolock lock(mMutex);
292     if (mSystemState != systemState) {
293         ALOGI("%s switching from %s to %s", kServiceName, toString(mSystemState),
294               toString(systemState));
295     }
296     mSystemState = systemState;
297 }
298 
onBootFinished()299 Result<void> WatchdogPerfService::onBootFinished() {
300     Mutex::Autolock lock(mMutex);
301     if (EventType expected = EventType::BOOT_TIME_COLLECTION; mCurrCollectionEvent != expected) {
302         /*
303          * This case happens when either the WatchdogPerfService has prematurely terminated before
304          * boot complete notification is received or multiple boot complete notifications are
305          * received. In either case don't return error as this will lead to runtime exception and
306          * cause system to boot loop.
307          */
308         ALOGE("Current performance data collection event %s != %s", toString(mCurrCollectionEvent),
309               toString(expected));
310         return {};
311     }
312     nsecs_t endBootCollectionTime = mHandlerLooper->now() + mPostSystemEventDurationNs.count();
313     mHandlerLooper->sendMessageAtTime(endBootCollectionTime,
314                                       sp<WatchdogPerfService>::fromExisting(this),
315                                       SwitchMessage::END_BOOTTIME_COLLECTION);
316     if (DEBUG) {
317         ALOGD("Boot complete signal received.");
318     }
319     return {};
320 }
321 
onUserStateChange(userid_t userId,const UserState & userState)322 Result<void> WatchdogPerfService::onUserStateChange(userid_t userId, const UserState& userState) {
323     Mutex::Autolock lock(mMutex);
324     if (mCurrCollectionEvent == EventType::BOOT_TIME_COLLECTION ||
325         mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
326         // Ignoring the user switch events because the boot-time and custom collections take
327         // precedence over other collections.
328         return {};
329     }
330     switch (static_cast<int>(userState)) {
331         case static_cast<int>(UserState::USER_STATE_SWITCHING):
332             // TODO(b/243984863): Handle multi-user switching scenario.
333             mUserSwitchCollection.from = mUserSwitchCollection.to;
334             mUserSwitchCollection.to = userId;
335             if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION &&
336                 mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
337                 ALOGE("Unable to start %s. Current performance data collection event: %s",
338                       toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
339                 return {};
340             }
341             startUserSwitchCollection();
342             ALOGI("Switching to %s (userIds: from = %d, to = %d)", toString(mCurrCollectionEvent),
343                   mUserSwitchCollection.from, mUserSwitchCollection.to);
344             break;
345         case static_cast<int>(UserState::USER_STATE_UNLOCKING):
346             if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION) {
347                 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
348                     ALOGE("Unable to start %s. Current performance data collection event: %s",
349                           toString(EventType::USER_SWITCH_COLLECTION),
350                           toString(mCurrCollectionEvent));
351                 }
352                 return {};
353             }
354             if (mUserSwitchCollection.to != userId) {
355                 return {};
356             }
357             startUserSwitchCollection();
358             ALOGI("Switching to %s (userId: %d)", toString(mCurrCollectionEvent), userId);
359             break;
360         case static_cast<int>(UserState::USER_STATE_POST_UNLOCKED): {
361             if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
362                 ALOGE("Ignoring USER_STATE_POST_UNLOCKED because no user switch collection in "
363                       "progress. Current performance data collection event: %s.",
364                       toString(mCurrCollectionEvent));
365                 return {};
366             }
367             if (mUserSwitchCollection.to != userId) {
368                 ALOGE("Ignoring USER_STATE_POST_UNLOCKED signal for user id: %d. "
369                       "Current user being switched to: %d",
370                       userId, mUserSwitchCollection.to);
371                 return {};
372             }
373             auto thiz = sp<WatchdogPerfService>::fromExisting(this);
374             mHandlerLooper->removeMessages(thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
375             nsecs_t endUserSwitchCollectionTime =
376                     mHandlerLooper->now() + mPostSystemEventDurationNs.count();
377             mHandlerLooper->sendMessageAtTime(endUserSwitchCollectionTime, thiz,
378                                               SwitchMessage::END_USER_SWITCH_COLLECTION);
379             break;
380         }
381         default:
382             ALOGE("Unsupported user state: %d", static_cast<int>(userState));
383             return {};
384     }
385     if (DEBUG) {
386         ALOGD("Handled user state change: userId = %d, userState = %d", userId,
387               static_cast<int>(userState));
388     }
389     return {};
390 }
391 
startUserSwitchCollection()392 Result<void> WatchdogPerfService::startUserSwitchCollection() {
393     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
394     mHandlerLooper->removeMessages(thiz);
395     mUserSwitchCollection.lastUptime = mHandlerLooper->now();
396     // End |EventType::USER_SWITCH_COLLECTION| after a timeout because the user switch end
397     // signal won't be received within a few seconds when the switch is blocked due to a
398     // keyguard event. Otherwise, polling beyond a few seconds will lead to unnecessary data
399     // collection.
400     nsecs_t uptime = mHandlerLooper->now() + mUserSwitchTimeoutNs.count();
401     mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
402     mCurrCollectionEvent = EventType::USER_SWITCH_COLLECTION;
403     mHandlerLooper->sendMessage(thiz, EventType::USER_SWITCH_COLLECTION);
404     return {};
405 }
406 
onSuspendExit()407 Result<void> WatchdogPerfService::onSuspendExit() {
408     Mutex::Autolock lock(mMutex);
409     if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
410         // Ignoring the suspend exit event because the custom collection takes
411         // precedence over other collections.
412         ALOGE("Unable to start %s. Current performance data collection event: %s",
413               toString(EventType::WAKE_UP_COLLECTION), toString(mCurrCollectionEvent));
414         return {};
415     }
416     if (mCurrCollectionEvent == EventType::WAKE_UP_COLLECTION) {
417         ALOGE("The current performance data collection event is already %s",
418               toString(EventType::WAKE_UP_COLLECTION));
419         return {};
420     }
421     notifySystemStartUpLocked();
422     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
423     mHandlerLooper->removeMessages(thiz);
424     mWakeUpCollection.lastUptime = mHandlerLooper->now();
425     nsecs_t uptime = mHandlerLooper->now() + mWakeUpDurationNs.count();
426     mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_WAKE_UP_COLLECTION);
427     mCurrCollectionEvent = EventType::WAKE_UP_COLLECTION;
428     mHandlerLooper->sendMessage(thiz, EventType::WAKE_UP_COLLECTION);
429     ALOGI("Switching to %s", toString(mCurrCollectionEvent));
430     return {};
431 }
432 
onShutdownEnter()433 Result<void> WatchdogPerfService::onShutdownEnter() {
434     Mutex::Autolock lock(mMutex);
435     if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
436         ALOGI("Unable to switch to %s during shutdown enter. Current performance data collection "
437               "event: %s",
438               toString(EventType::PERIODIC_COLLECTION), toString(mCurrCollectionEvent));
439         return {};
440     }
441     switchToPeriodicLocked(/*startNow=*/true);
442     return {};
443 }
444 
onCustomCollection(int fd,const Vector<String16> & args)445 Result<void> WatchdogPerfService::onCustomCollection(int fd, const Vector<String16>& args) {
446     if (args.empty()) {
447         return Error(BAD_VALUE) << "No custom collection dump arguments";
448     }
449 
450     if (args[0] == String16(kStartCustomCollectionFlag)) {
451         if (args.size() > 7) {
452             return Error(BAD_VALUE) << "Number of arguments to start custom performance data "
453                                     << "collection cannot exceed 7";
454         }
455         std::chrono::nanoseconds interval = kCustomCollectionInterval;
456         std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
457         std::unordered_set<std::string> filterPackages;
458         for (size_t i = 1; i < args.size(); ++i) {
459             if (args[i] == String16(kIntervalFlag)) {
460                 const auto& result = parseSecondsFlag(args, i + 1);
461                 if (!result.ok()) {
462                     return Error(BAD_VALUE)
463                             << "Failed to parse " << kIntervalFlag << ": " << result.error();
464                 }
465                 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
466                 ++i;
467                 continue;
468             }
469             if (args[i] == String16(kMaxDurationFlag)) {
470                 const auto& result = parseSecondsFlag(args, i + 1);
471                 if (!result.ok()) {
472                     return Error(BAD_VALUE)
473                             << "Failed to parse " << kMaxDurationFlag << ": " << result.error();
474                 }
475                 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
476                 ++i;
477                 continue;
478             }
479             if (args[i] == String16(kFilterPackagesFlag)) {
480                 if (args.size() < i + 1) {
481                     return Error(BAD_VALUE)
482                             << "Must provide value for '" << kFilterPackagesFlag << "' flag";
483                 }
484                 std::vector<std::string> packages =
485                         Split(std::string(String8(args[i + 1]).string()), ",");
486                 std::copy(packages.begin(), packages.end(),
487                           std::inserter(filterPackages, filterPackages.end()));
488                 ++i;
489                 continue;
490             }
491             ALOGW("Unknown flag %s provided to start custom performance data collection",
492                   String8(args[i]).string());
493             return Error(BAD_VALUE) << "Unknown flag " << String8(args[i]).string()
494                                     << " provided to start custom performance data collection";
495         }
496         if (const auto& result = startCustomCollection(interval, maxDuration, filterPackages);
497             !result.ok()) {
498             WriteStringToFd(result.error().message(), fd);
499             return result;
500         }
501         return {};
502     }
503 
504     if (args[0] == String16(kEndCustomCollectionFlag)) {
505         if (args.size() != 1) {
506             ALOGW("Number of arguments to stop custom performance data collection cannot exceed 1. "
507                   "Stopping the data collection.");
508             WriteStringToFd("Number of arguments to stop custom performance data collection "
509                             "cannot exceed 1. Stopping the data collection.",
510                             fd);
511         }
512         return endCustomCollection(fd);
513     }
514 
515     return Error(BAD_VALUE) << "Custom perf collection dump arguments start neither with "
516                             << kStartCustomCollectionFlag << " nor with "
517                             << kEndCustomCollectionFlag << " flags";
518 }
519 
onDump(int fd) const520 Result<void> WatchdogPerfService::onDump(int fd) const {
521     Mutex::Autolock lock(mMutex);
522     if (mCurrCollectionEvent == EventType::TERMINATED) {
523         ALOGW("%s not active. Dumping cached data", kServiceName);
524         if (!WriteStringToFd(StringPrintf("%s not active. Dumping cached data.", kServiceName),
525                              fd)) {
526             return Error(FAILED_TRANSACTION) << "Failed to write " << kServiceName << " status";
527         }
528     }
529 
530     if (const auto& result = dumpCollectorsStatusLocked(fd); !result.ok()) {
531         return Error(FAILED_TRANSACTION) << result.error();
532     }
533 
534     if (!WriteStringToFd(StringPrintf("\n%s%s report:\n%sBoot-time collection information:\n%s\n",
535                                       kDumpMajorDelimiter.c_str(), kServiceName,
536                                       kDumpMajorDelimiter.c_str(), std::string(33, '=').c_str()),
537                          fd) ||
538         !WriteStringToFd(mBoottimeCollection.toString(), fd) ||
539         !WriteStringToFd(StringPrintf("\nWake-up collection information:\n%s\n",
540                                       std::string(31, '=').c_str()),
541                          fd) ||
542         !WriteStringToFd(mWakeUpCollection.toString(), fd) ||
543         !WriteStringToFd(StringPrintf("\nUser-switch collection information:\n%s\n",
544                                       std::string(35, '=').c_str()),
545                          fd) ||
546         !WriteStringToFd(mUserSwitchCollection.toString(), fd) ||
547         !WriteStringToFd(StringPrintf("\nPeriodic collection information:\n%s\n",
548                                       std::string(32, '=').c_str()),
549                          fd) ||
550         !WriteStringToFd(mPeriodicCollection.toString(), fd)) {
551         return Error(FAILED_TRANSACTION)
552                 << "Failed to dump the boot-time and periodic collection reports.";
553     }
554 
555     for (const auto& processor : mDataProcessors) {
556         if (const auto result = processor->onDump(fd); !result.ok()) {
557             return result;
558         }
559     }
560 
561     WriteStringToFd(kDumpMajorDelimiter, fd);
562     return {};
563 }
564 
dumpHelpText(int fd) const565 bool WatchdogPerfService::dumpHelpText(int fd) const {
566     return WriteStringToFd(StringPrintf(kHelpText, kServiceName, kStartCustomCollectionFlag,
567                                         kIntervalFlag,
568                                         std::chrono::duration_cast<std::chrono::seconds>(
569                                                 kCustomCollectionInterval)
570                                                 .count(),
571                                         kMaxDurationFlag,
572                                         std::chrono::duration_cast<std::chrono::minutes>(
573                                                 kCustomCollectionDuration)
574                                                 .count(),
575                                         kFilterPackagesFlag, kEndCustomCollectionFlag),
576                            fd);
577 }
578 
dumpCollectorsStatusLocked(int fd) const579 Result<void> WatchdogPerfService::dumpCollectorsStatusLocked(int fd) const {
580     if (!mUidStatsCollector->enabled() &&
581         !WriteStringToFd(StringPrintf("UidStatsCollector failed to access proc and I/O files"),
582                          fd)) {
583         return Error() << "Failed to write UidStatsCollector status";
584     }
585     if (!mProcStatCollector->enabled() &&
586         !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
587                                       mProcStatCollector->filePath().c_str()),
588                          fd)) {
589         return Error() << "Failed to write ProcStat collector status";
590     }
591     return {};
592 }
593 
startCustomCollection(std::chrono::nanoseconds interval,std::chrono::nanoseconds maxDuration,const std::unordered_set<std::string> & filterPackages)594 Result<void> WatchdogPerfService::startCustomCollection(
595         std::chrono::nanoseconds interval, std::chrono::nanoseconds maxDuration,
596         const std::unordered_set<std::string>& filterPackages) {
597     if (interval < kMinEventInterval || maxDuration < kMinEventInterval) {
598         return Error(INVALID_OPERATION)
599                 << "Collection interval and maximum duration must be >= "
600                 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinEventInterval).count()
601                 << " milliseconds.";
602     }
603     Mutex::Autolock lock(mMutex);
604     if (EventType expected = EventType::PERIODIC_COLLECTION; mCurrCollectionEvent != expected) {
605         return Error(INVALID_OPERATION)
606                 << "Cannot start a custom collection when the current collection event "
607                 << toString(mCurrCollectionEvent) << " != " << toString(expected)
608                 << " collection event";
609     }
610 
611     mCustomCollection = {
612             .eventType = EventType::CUSTOM_COLLECTION,
613             .interval = interval,
614             .lastUptime = mHandlerLooper->now(),
615             .filterPackages = filterPackages,
616     };
617 
618     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
619     mHandlerLooper->removeMessages(thiz);
620     nsecs_t uptime = mHandlerLooper->now() + maxDuration.count();
621     mHandlerLooper->sendMessageAtTime(uptime, thiz, SwitchMessage::END_CUSTOM_COLLECTION);
622     mCurrCollectionEvent = EventType::CUSTOM_COLLECTION;
623     mHandlerLooper->sendMessage(thiz, EventType::CUSTOM_COLLECTION);
624     ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
625     return {};
626 }
627 
endCustomCollection(int fd)628 Result<void> WatchdogPerfService::endCustomCollection(int fd) {
629     Mutex::Autolock lock(mMutex);
630     if (mCurrCollectionEvent != EventType::CUSTOM_COLLECTION) {
631         return Error(INVALID_OPERATION) << "No custom collection is running";
632     }
633 
634     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
635     mHandlerLooper->removeMessages(thiz);
636     mHandlerLooper->sendMessage(thiz, SwitchMessage::END_CUSTOM_COLLECTION);
637 
638     if (const auto result = dumpCollectorsStatusLocked(fd); !result.ok()) {
639         return Error(FAILED_TRANSACTION) << result.error();
640     }
641 
642     if (!WriteStringToFd(StringPrintf("%sPerformance data report for custom collection:\n%s",
643                                       kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
644                          fd) ||
645         !WriteStringToFd(mCustomCollection.toString(), fd)) {
646         return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
647     }
648 
649     for (const auto& processor : mDataProcessors) {
650         if (const auto result = processor->onCustomCollectionDump(fd); !result.ok()) {
651             return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
652                            << " collection: " << result.error();
653         }
654     }
655 
656     if (DEBUG) {
657         ALOGD("Custom event finished");
658     }
659     WriteStringToFd(kDumpMajorDelimiter, fd);
660     return {};
661 }
662 
switchToPeriodicLocked(bool startNow)663 void WatchdogPerfService::switchToPeriodicLocked(bool startNow) {
664     if (mCurrCollectionEvent == EventType::PERIODIC_COLLECTION) {
665         ALOGW("The current performance data collection event is already %s",
666               toString(mCurrCollectionEvent));
667         return;
668     }
669     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
670     mHandlerLooper->removeMessages(thiz);
671     mCurrCollectionEvent = EventType::PERIODIC_COLLECTION;
672     mPeriodicCollection.lastUptime = mHandlerLooper->now();
673     if (startNow) {
674         mHandlerLooper->sendMessage(thiz, EventType::PERIODIC_COLLECTION);
675     } else {
676         mPeriodicCollection.lastUptime += mPeriodicCollection.interval.count();
677         mHandlerLooper->sendMessageAtTime(mPeriodicCollection.lastUptime, thiz,
678                                           EventType::PERIODIC_COLLECTION);
679     }
680     mPeriodicMonitor.lastUptime = mHandlerLooper->now() + mPeriodicMonitor.interval.count();
681     mHandlerLooper->sendMessageAtTime(mPeriodicMonitor.lastUptime, thiz,
682                                       EventType::PERIODIC_MONITOR);
683     ALOGI("Switching to %s and %s", toString(mCurrCollectionEvent),
684           toString(EventType::PERIODIC_MONITOR));
685 }
686 
handleMessage(const Message & message)687 void WatchdogPerfService::handleMessage(const Message& message) {
688     Result<void> result;
689 
690     switch (message.what) {
691         case static_cast<int>(EventType::BOOT_TIME_COLLECTION):
692             result = processCollectionEvent(&mBoottimeCollection);
693             break;
694         case static_cast<int>(SwitchMessage::END_BOOTTIME_COLLECTION):
695             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
696             if (result = processCollectionEvent(&mBoottimeCollection); result.ok()) {
697                 Mutex::Autolock lock(mMutex);
698                 switchToPeriodicLocked(/*startNow=*/false);
699             }
700             break;
701         case static_cast<int>(EventType::PERIODIC_COLLECTION):
702             result = processCollectionEvent(&mPeriodicCollection);
703             break;
704         case static_cast<int>(EventType::USER_SWITCH_COLLECTION):
705             result = processCollectionEvent(&mUserSwitchCollection);
706             break;
707         case static_cast<int>(EventType::WAKE_UP_COLLECTION):
708             result = processCollectionEvent(&mWakeUpCollection);
709             break;
710         case static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION):
711         case static_cast<int>(SwitchMessage::END_WAKE_UP_COLLECTION): {
712             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
713             EventMetadata* eventMetadata =
714                     message.what == static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION)
715                     ? &mUserSwitchCollection
716                     : &mWakeUpCollection;
717             if (result = processCollectionEvent(eventMetadata); result.ok()) {
718                 Mutex::Autolock lock(mMutex);
719                 switchToPeriodicLocked(/*startNow=*/false);
720             }
721             break;
722         }
723         case static_cast<int>(EventType::CUSTOM_COLLECTION):
724             result = processCollectionEvent(&mCustomCollection);
725             break;
726         case static_cast<int>(EventType::PERIODIC_MONITOR):
727             result = processMonitorEvent(&mPeriodicMonitor);
728             break;
729         case static_cast<int>(SwitchMessage::END_CUSTOM_COLLECTION): {
730             Mutex::Autolock lock(mMutex);
731             if (EventType expected = EventType::CUSTOM_COLLECTION;
732                 mCurrCollectionEvent != expected) {
733                 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
734                       toString(mCurrCollectionEvent), toString(expected));
735                 return;
736             }
737             mCustomCollection = {};
738             for (const auto& processor : mDataProcessors) {
739                 /*
740                  * Clear custom collection cache on the data processors when the custom collection
741                  * ends.
742                  */
743                 processor->onCustomCollectionDump(-1);
744             }
745             switchToPeriodicLocked(/*startNow=*/true);
746             return;
747         }
748         default:
749             result = Error() << "Unknown message: " << message.what;
750     }
751 
752     if (!result.ok()) {
753         Mutex::Autolock lock(mMutex);
754         ALOGE("Terminating %s: %s", kServiceName, result.error().message().c_str());
755         /*
756          * DO NOT CALL terminate() as it tries to join the collection thread but this code is
757          * executed on the collection thread. Thus it will result in a deadlock.
758          */
759         mCurrCollectionEvent = EventType::TERMINATED;
760         mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
761         mHandlerLooper->wake();
762     }
763 }
764 
processCollectionEvent(WatchdogPerfService::EventMetadata * metadata)765 Result<void> WatchdogPerfService::processCollectionEvent(
766         WatchdogPerfService::EventMetadata* metadata) {
767     Mutex::Autolock lock(mMutex);
768     /*
769      * Messages sent to the looper are intrinsically racy such that a message from the previous
770      * collection event may land in the looper after the current collection has already begun. Thus
771      * verify the current collection event before starting the collection.
772      */
773     if (mCurrCollectionEvent != metadata->eventType) {
774         ALOGW("Skipping %s event on collection event %s", toString(metadata->eventType),
775               toString(mCurrCollectionEvent));
776         return {};
777     }
778     if (DEBUG) {
779         ALOGD("Processing %s collection event", toString(metadata->eventType));
780     }
781     if (metadata->interval < kMinEventInterval) {
782         return Error()
783                 << "Collection interval of "
784                 << std::chrono::duration_cast<std::chrono::seconds>(metadata->interval).count()
785                 << " seconds for " << toString(metadata->eventType)
786                 << " collection cannot be less than "
787                 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
788                 << " seconds";
789     }
790     if (const auto result = collectLocked(metadata); !result.ok()) {
791         return Error() << toString(metadata->eventType) << " collection failed: " << result.error();
792     }
793     metadata->lastUptime += metadata->interval.count();
794     mHandlerLooper->sendMessageAtTime(metadata->lastUptime,
795                                       sp<WatchdogPerfService>::fromExisting(this),
796                                       metadata->eventType);
797     return {};
798 }
799 
collectLocked(WatchdogPerfService::EventMetadata * metadata)800 Result<void> WatchdogPerfService::collectLocked(WatchdogPerfService::EventMetadata* metadata) {
801     if (!mUidStatsCollector->enabled() && !mProcStatCollector->enabled()) {
802         return Error() << "No collectors enabled";
803     }
804 
805     time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
806 
807     if (mUidStatsCollector->enabled()) {
808         if (const auto result = mUidStatsCollector->collect(); !result.ok()) {
809             return Error() << "Failed to collect per-uid proc and I/O stats: " << result.error();
810         }
811     }
812 
813     if (mProcStatCollector->enabled()) {
814         if (const auto result = mProcStatCollector->collect(); !result.ok()) {
815             return Error() << "Failed to collect proc stats: " << result.error();
816         }
817     }
818 
819     for (const auto& processor : mDataProcessors) {
820         Result<void> result;
821         switch (mCurrCollectionEvent) {
822             case EventType::BOOT_TIME_COLLECTION:
823                 result = processor->onBoottimeCollection(now, mUidStatsCollector,
824                                                          mProcStatCollector);
825                 break;
826             case EventType::PERIODIC_COLLECTION:
827                 result = processor->onPeriodicCollection(now, mSystemState, mUidStatsCollector,
828                                                          mProcStatCollector);
829                 break;
830             case EventType::USER_SWITCH_COLLECTION: {
831                 WatchdogPerfService::UserSwitchEventMetadata* userSwitchMetadata =
832                         static_cast<WatchdogPerfService::UserSwitchEventMetadata*>(metadata);
833                 result = processor->onUserSwitchCollection(now, userSwitchMetadata->from,
834                                                            userSwitchMetadata->to,
835                                                            mUidStatsCollector, mProcStatCollector);
836                 break;
837             }
838             case EventType::WAKE_UP_COLLECTION:
839                 result = processor->onWakeUpCollection(now, mUidStatsCollector, mProcStatCollector);
840                 break;
841             case EventType::CUSTOM_COLLECTION:
842                 result = processor->onCustomCollection(now, mSystemState, metadata->filterPackages,
843                                                        mUidStatsCollector, mProcStatCollector);
844                 break;
845             default:
846                 result = Error() << "Invalid collection event " << toString(mCurrCollectionEvent);
847         }
848         if (!result.ok()) {
849             return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
850                            << " collection: " << result.error();
851         }
852     }
853 
854     return {};
855 }
856 
processMonitorEvent(WatchdogPerfService::EventMetadata * metadata)857 Result<void> WatchdogPerfService::processMonitorEvent(
858         WatchdogPerfService::EventMetadata* metadata) {
859     if (metadata->eventType != static_cast<int>(EventType::PERIODIC_MONITOR)) {
860         return Error() << "Invalid monitor event " << toString(metadata->eventType);
861     }
862     if (DEBUG) {
863         ALOGD("Processing %s monitor event", toString(metadata->eventType));
864     }
865     if (metadata->interval < kMinEventInterval) {
866         return Error()
867                 << "Monitor interval of "
868                 << std::chrono::duration_cast<std::chrono::seconds>(metadata->interval).count()
869                 << " seconds for " << toString(metadata->eventType) << " event cannot be less than "
870                 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
871                 << " seconds";
872     }
873     Mutex::Autolock lock(mMutex);
874     if (!mProcDiskStatsCollector->enabled()) {
875         return Error() << "Cannot access proc disk stats for monitoring";
876     }
877     time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
878     if (const auto result = mProcDiskStatsCollector->collect(); !result.ok()) {
879         return Error() << "Failed to collect disk stats: " << result.error();
880     }
881     auto* currCollectionMetadata = currCollectionMetadataLocked();
882     if (currCollectionMetadata == nullptr) {
883         return Error() << "No metadata available for current collection event: "
884                        << toString(mCurrCollectionEvent);
885     }
886     bool requestedCollection = false;
887     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
888     const auto requestCollection = [&]() mutable {
889         if (requestedCollection) {
890             return;
891         }
892         const nsecs_t prevUptime =
893                 currCollectionMetadata->lastUptime - currCollectionMetadata->interval.count();
894         nsecs_t uptime = mHandlerLooper->now();
895         if (const auto delta = std::abs(uptime - prevUptime); delta < kMinEventInterval.count()) {
896             return;
897         }
898         currCollectionMetadata->lastUptime = uptime;
899         mHandlerLooper->removeMessages(thiz, currCollectionMetadata->eventType);
900         mHandlerLooper->sendMessage(thiz, currCollectionMetadata->eventType);
901         requestedCollection = true;
902     };
903     for (const auto& processor : mDataProcessors) {
904         if (const auto result =
905                     processor->onPeriodicMonitor(now, mProcDiskStatsCollector, requestCollection);
906             !result.ok()) {
907             return Error() << processor->name() << " failed on " << toString(metadata->eventType)
908                            << ": " << result.error();
909         }
910     }
911     metadata->lastUptime += metadata->interval.count();
912     if (metadata->lastUptime == currCollectionMetadata->lastUptime) {
913         /*
914          * If the |PERIODIC_MONITOR| and  *_COLLECTION events overlap, skip the |PERIODIC_MONITOR|
915          * event.
916          */
917         metadata->lastUptime += metadata->interval.count();
918     }
919     mHandlerLooper->sendMessageAtTime(metadata->lastUptime, thiz, metadata->eventType);
920     return {};
921 }
922 
notifySystemStartUpLocked()923 Result<void> WatchdogPerfService::notifySystemStartUpLocked() {
924     for (const auto& processor : mDataProcessors) {
925         if (const auto result = processor->onSystemStartup(); !result.ok()) {
926             ALOGE("%s failed to process system startup event", processor->name().c_str());
927             return Error() << processor->name() << " failed to process system startup event";
928         }
929     }
930     return {};
931 }
932 
currCollectionMetadataLocked()933 WatchdogPerfService::EventMetadata* WatchdogPerfService::currCollectionMetadataLocked() {
934     switch (mCurrCollectionEvent) {
935         case EventType::BOOT_TIME_COLLECTION:
936             return &mBoottimeCollection;
937         case EventType::PERIODIC_COLLECTION:
938             return &mPeriodicCollection;
939         case EventType::CUSTOM_COLLECTION:
940             return &mCustomCollection;
941         default:
942             return nullptr;
943     }
944 }
945 
946 }  // namespace watchdog
947 }  // namespace automotive
948 }  // namespace android
949