• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false  // STOPSHIP if true.
19 
20 #include "WatchdogPerfService.h"
21 
22 #include <android-base/file.h>
23 #include <android-base/parseint.h>
24 #include <android-base/stringprintf.h>
25 #include <android-base/strings.h>
26 #include <log/log.h>
27 #include <processgroup/sched_policy.h>
28 
29 #include <pthread.h>
30 
31 #include <iterator>
32 #include <vector>
33 
34 namespace android {
35 namespace automotive {
36 namespace watchdog {
37 
38 namespace {
39 
40 using ::aidl::android::automotive::watchdog::internal::ResourceStats;
41 using ::aidl::android::automotive::watchdog::internal::UserState;
42 using ::android::sp;
43 using ::android::String16;
44 using ::android::String8;
45 using ::android::base::EqualsIgnoreCase;
46 using ::android::base::Error;
47 using ::android::base::Join;
48 using ::android::base::ParseUint;
49 using ::android::base::Result;
50 using ::android::base::Split;
51 using ::android::base::StringAppendF;
52 using ::android::base::StringPrintf;
53 using ::android::base::WriteStringToFd;
54 
55 const int32_t kMaxCachedUnsentResourceStats = 10;
56 const std::chrono::nanoseconds kPrevUnsentResourceStatsDelayNs = 3s;
57 // Minimum required collection polling interval between subsequent collections.
58 const std::chrono::nanoseconds kMinEventInterval = 1s;
59 const std::chrono::seconds kDefaultSystemEventCollectionInterval = 1s;
60 const std::chrono::seconds kDefaultPeriodicCollectionInterval = 20s;
61 const std::chrono::seconds kDefaultPeriodicMonitorInterval = 5s;
62 const std::chrono::nanoseconds kCustomCollectionInterval = 10s;
63 const std::chrono::nanoseconds kCustomCollectionDuration = 30min;
64 
65 constexpr const char* kServiceName = "WatchdogPerfService";
66 static const std::string kDumpMajorDelimiter = std::string(100, '-') + "\n";  // NOLINT
67 constexpr const char* kHelpText =
68         "\n%s dump options:\n"
69         "%s: Starts custom performance data collection. Customize the collection behavior with "
70         "the following optional arguments:\n"
71         "\t%s <seconds>: Modifies the collection interval. Default behavior is to collect once "
72         "every %lld seconds.\n"
73         "\t%s <seconds>: Modifies the maximum collection duration. Default behavior is to collect "
74         "until %ld minutes before automatically stopping the custom collection and discarding "
75         "the collected data.\n"
76         "\t%s <package name>,<package name>,...: Comma-separated value containing package names. "
77         "When provided, the results are filtered only to the provided package names. Default "
78         "behavior is to list the results for the top N packages.\n"
79         "%s: Stops custom performance data collection and generates a dump of "
80         "the collection report.\n\n"
81         "When no options are specified, the car watchdog report contains the performance data "
82         "collected during boot-time and over the last few minutes before the report generation.\n";
83 
parseSecondsFlag(const char ** args,uint32_t numArgs,size_t pos)84 Result<std::chrono::seconds> parseSecondsFlag(const char** args, uint32_t numArgs, size_t pos) {
85     if (numArgs <= pos) {
86         return Error() << "Value not provided";
87     }
88     uint64_t value;
89     if (std::string strValue = std::string(args[pos]); !ParseUint(strValue, &value)) {
90         return Error() << "Invalid value " << strValue << ", must be an integer";
91     }
92     return std::chrono::seconds(value);
93 }
94 
toString(std::variant<EventType,SwitchMessage> what)95 constexpr const char* toString(std::variant<EventType, SwitchMessage> what) {
96     return std::visit(
97             [&](const auto& v) -> const char* {
98                 switch (static_cast<int>(v)) {
99                     case EventType::INIT:
100                         return "INIT";
101                     case EventType::TERMINATED:
102                         return "TERMINATED";
103                     case EventType::BOOT_TIME_COLLECTION:
104                         return "BOOT_TIME_COLLECTION";
105                     case EventType::PERIODIC_COLLECTION:
106                         return "PERIODIC_COLLECTION";
107                     case EventType::USER_SWITCH_COLLECTION:
108                         return "USER_SWITCH_COLLECTION";
109                     case EventType::WAKE_UP_COLLECTION:
110                         return "WAKE_UP_COLLECTION";
111                     case EventType::CUSTOM_COLLECTION:
112                         return "CUSTOM_COLLECTION";
113                     case EventType::PERIODIC_MONITOR:
114                         return "PERIODIC_MONITOR";
115                     case EventType::LAST_EVENT:
116                         return "LAST_EVENT";
117                     case SwitchMessage::END_BOOTTIME_COLLECTION:
118                         return "END_BOOTTIME_COLLECTION";
119                     case SwitchMessage::END_USER_SWITCH_COLLECTION:
120                         return "END_USER_SWITCH_COLLECTION";
121                     case SwitchMessage::END_WAKE_UP_COLLECTION:
122                         return "END_WAKE_UP_COLLECTION";
123                     case SwitchMessage::END_CUSTOM_COLLECTION:
124                         return "END_CUSTOM_COLLECTION";
125                     default:
126                         return "INVALID_EVENT_OR_SWITCH_MESSAGE";
127                 }
128             },
129             what);
130 }
131 
toString(SystemState systemState)132 constexpr const char* toString(SystemState systemState) {
133     switch (systemState) {
134         case SystemState::NORMAL_MODE:
135             return "NORMAL_MODE";
136         case SystemState::GARAGE_MODE:
137             return "GARAGE_MODE";
138         default:
139             return "UNKNOWN MODE";
140     }
141 }
142 
isEmpty(const ResourceStats & resourceStats)143 bool isEmpty(const ResourceStats& resourceStats) {
144     return !resourceStats.resourceUsageStats.has_value() &&
145             !resourceStats.resourceOveruseStats.has_value();
146 }
147 
148 }  // namespace
149 
toString() const150 std::string WatchdogPerfService::EventMetadata::toString() const {
151     std::string buffer;
152     const auto intervalInSecs =
153             std::chrono::duration_cast<std::chrono::seconds>(pollingIntervalNs).count();
154     StringAppendF(&buffer, "Event polling interval: %lld second%s\n", intervalInSecs,
155                   ((intervalInSecs > 1) ? "s" : ""));
156     if (!filterPackages.empty()) {
157         std::vector<std::string> packages(filterPackages.begin(), filterPackages.end());
158         StringAppendF(&buffer, "Filtered results to packages: %s\n", Join(packages, ", ").c_str());
159     }
160     return buffer;
161 }
162 
registerDataProcessor(sp<DataProcessorInterface> processor)163 Result<void> WatchdogPerfService::registerDataProcessor(sp<DataProcessorInterface> processor) {
164     if (processor == nullptr) {
165         return Error() << "Must provide a valid data processor";
166     }
167     if (const auto result = processor->init(); !result.ok()) {
168         return Error() << "Failed to initialize " << processor->name().c_str() << ": "
169                        << result.error().message();
170     }
171     Mutex::Autolock lock(mMutex);
172     mDataProcessors.push_back(processor);
173     if (DEBUG) {
174         ALOGD("Successfully registered %s to %s", processor->name().c_str(), kServiceName);
175     }
176     return {};
177 }
178 
start()179 Result<void> WatchdogPerfService::start() {
180     {
181         Mutex::Autolock lock(mMutex);
182         if (mCurrCollectionEvent != EventType::INIT || mCollectionThread.joinable()) {
183             return Error(INVALID_OPERATION) << "Cannot start " << kServiceName << " more than once";
184         }
185         if (mWatchdogServiceHelper == nullptr) {
186             return Error(INVALID_OPERATION) << "No watchdog service helper is registered";
187         }
188         std::chrono::nanoseconds systemEventCollectionInterval =
189                 std::chrono::duration_cast<std::chrono::nanoseconds>(
190                         std::chrono::seconds(sysprop::systemEventCollectionInterval().value_or(
191                                 kDefaultSystemEventCollectionInterval.count())));
192         std::chrono::nanoseconds periodicCollectionInterval =
193                 std::chrono::duration_cast<std::chrono::nanoseconds>(
194                         std::chrono::seconds(sysprop::periodicCollectionInterval().value_or(
195                                 kDefaultPeriodicCollectionInterval.count())));
196         std::chrono::nanoseconds periodicMonitorInterval =
197                 std::chrono::duration_cast<std::chrono::nanoseconds>(
198                         std::chrono::seconds(sysprop::periodicMonitorInterval().value_or(
199                                 kDefaultPeriodicMonitorInterval.count())));
200         mBoottimeCollection = {
201                 .eventType = EventType::BOOT_TIME_COLLECTION,
202                 .pollingIntervalNs = systemEventCollectionInterval,
203         };
204         mPeriodicCollection = {
205                 .eventType = EventType::PERIODIC_COLLECTION,
206                 .pollingIntervalNs = periodicCollectionInterval,
207         };
208         mUserSwitchCollection = {{
209                 .eventType = EventType::USER_SWITCH_COLLECTION,
210                 .pollingIntervalNs = systemEventCollectionInterval,
211         }};
212         mWakeUpCollection = {
213                 .eventType = EventType::WAKE_UP_COLLECTION,
214                 .pollingIntervalNs = systemEventCollectionInterval,
215         };
216         mPeriodicMonitor = {
217                 .eventType = EventType::PERIODIC_MONITOR,
218                 .pollingIntervalNs = periodicMonitorInterval,
219         };
220         if (mDataProcessors.empty()) {
221             ALOGE("Terminating %s: No data processor is registered", kServiceName);
222             mCurrCollectionEvent = EventType::TERMINATED;
223             return Error() << "No data processor is registered";
224         }
225         mUidStatsCollector->init();
226         mProcStatCollector->init();
227         mProcDiskStatsCollector->init();
228     }
229 
230     mCollectionThread = std::thread([&]() {
231         {
232             Mutex::Autolock lock(mMutex);
233             if (EventType expected = EventType::INIT; mCurrCollectionEvent != expected) {
234                 ALOGE("Skipping performance data collection as the current collection event "
235                       "%s != %s",
236                       toString(mCurrCollectionEvent), toString(expected));
237                 return;
238             }
239             notifySystemStartUpLocked();
240             mCurrCollectionEvent = EventType::BOOT_TIME_COLLECTION;
241             mBoottimeCollection.lastPollUptimeNs = mHandlerLooper->now();
242             mHandlerLooper->setLooper(Looper::prepare(/*opts=*/0));
243             mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
244                                         EventType::BOOT_TIME_COLLECTION);
245         }
246         if (set_sched_policy(0, SP_BACKGROUND) != 0) {
247             ALOGW("Failed to set background scheduling priority to %s thread", kServiceName);
248         }
249         if (int result = pthread_setname_np(pthread_self(), "WatchdogPerfSvc"); result != 0) {
250             ALOGE("Failed to set %s thread name: %d", kServiceName, result);
251         }
252         ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
253         bool isCollectionActive = true;
254         /*
255          * Loop until the collection is not active -- performance collection runs on this thread in
256          * a handler.
257          */
258         while (isCollectionActive) {
259             mHandlerLooper->pollAll(/*timeoutMillis=*/-1);
260             Mutex::Autolock lock(mMutex);
261             isCollectionActive = mCurrCollectionEvent != EventType::TERMINATED;
262         }
263     });
264     return {};
265 }
266 
terminate()267 void WatchdogPerfService::terminate() {
268     {
269         Mutex::Autolock lock(mMutex);
270         if (mCurrCollectionEvent == EventType::TERMINATED) {
271             ALOGE("%s was terminated already", kServiceName);
272             return;
273         }
274         ALOGE("Terminating %s as car watchdog is terminating", kServiceName);
275         if (mCurrCollectionEvent != EventType::INIT) {
276             /*
277              * Looper runs only after EventType::INIT has completed so remove looper messages
278              * and wake the looper only when the current collection has changed from INIT.
279              */
280             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
281             mHandlerLooper->wake();
282         }
283         for (const auto& processor : mDataProcessors) {
284             processor->terminate();
285         }
286         mCurrCollectionEvent = EventType::TERMINATED;
287         mUnsentResourceStats.clear();
288     }
289     if (mCollectionThread.joinable()) {
290         mCollectionThread.join();
291         if (DEBUG) {
292             ALOGD("%s collection thread terminated", kServiceName);
293         }
294     }
295 }
296 
setSystemState(SystemState systemState)297 void WatchdogPerfService::setSystemState(SystemState systemState) {
298     Mutex::Autolock lock(mMutex);
299     if (mSystemState != systemState) {
300         ALOGI("%s switching from %s to %s", kServiceName, toString(mSystemState),
301               toString(systemState));
302     }
303     mSystemState = systemState;
304 }
305 
onCarWatchdogServiceRegistered()306 void WatchdogPerfService::onCarWatchdogServiceRegistered() {
307     Mutex::Autolock lock(mMutex);
308     for (const auto& processor : mDataProcessors) {
309         processor->onCarWatchdogServiceRegistered();
310     }
311     if (mUnsentResourceStats.empty()) {
312         return;
313     }
314     mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
315                                 TaskMessage::SEND_RESOURCE_STATS);
316 }
317 
onBootFinished()318 Result<void> WatchdogPerfService::onBootFinished() {
319     Mutex::Autolock lock(mMutex);
320     if (EventType expected = EventType::BOOT_TIME_COLLECTION; mCurrCollectionEvent != expected) {
321         /*
322          * This case happens when either the WatchdogPerfService has prematurely terminated before
323          * boot complete notification is received or multiple boot complete notifications are
324          * received. In either case don't return error as this will lead to runtime exception and
325          * cause system to boot loop.
326          */
327         ALOGE("Current performance data collection event %s != %s", toString(mCurrCollectionEvent),
328               toString(expected));
329         return {};
330     }
331     mHandlerLooper->sendMessageAtTime(mHandlerLooper->now() + mPostSystemEventDurationNs.count(),
332                                       sp<WatchdogPerfService>::fromExisting(this),
333                                       SwitchMessage::END_BOOTTIME_COLLECTION);
334     if (DEBUG) {
335         ALOGD("Boot complete signal received.");
336     }
337     return {};
338 }
339 
onUserStateChange(userid_t userId,const UserState & userState)340 Result<void> WatchdogPerfService::onUserStateChange(userid_t userId, const UserState& userState) {
341     Mutex::Autolock lock(mMutex);
342     if (mCurrCollectionEvent == EventType::BOOT_TIME_COLLECTION ||
343         mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
344         // Ignoring the user switch events because the boot-time and custom collections take
345         // precedence over other collections.
346         if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
347             ALOGW("Unable to start %s. Current performance data collection event: %s",
348                   toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
349         }
350         return {};
351     }
352     switch (static_cast<int>(userState)) {
353         case static_cast<int>(UserState::USER_STATE_SWITCHING):
354             // TODO(b/243984863): Handle multi-user switching scenario.
355             mUserSwitchCollection.from = mUserSwitchCollection.to;
356             mUserSwitchCollection.to = userId;
357             if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION &&
358                 mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
359                 ALOGE("Unable to start %s. Current performance data collection event: %s",
360                       toString(EventType::USER_SWITCH_COLLECTION), toString(mCurrCollectionEvent));
361                 return {};
362             }
363             startUserSwitchCollection();
364             ALOGI("Switching to %s (userIds: from = %d, to = %d)", toString(mCurrCollectionEvent),
365                   mUserSwitchCollection.from, mUserSwitchCollection.to);
366             break;
367         case static_cast<int>(UserState::USER_STATE_UNLOCKING):
368             if (mCurrCollectionEvent != EventType::PERIODIC_COLLECTION) {
369                 if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
370                     ALOGE("Unable to start %s. Current performance data collection event: %s",
371                           toString(EventType::USER_SWITCH_COLLECTION),
372                           toString(mCurrCollectionEvent));
373                 }
374                 return {};
375             }
376             if (mUserSwitchCollection.to != userId) {
377                 return {};
378             }
379             startUserSwitchCollection();
380             ALOGI("Switching to %s (userId: %d)", toString(mCurrCollectionEvent), userId);
381             break;
382         case static_cast<int>(UserState::USER_STATE_POST_UNLOCKED): {
383             if (mCurrCollectionEvent != EventType::USER_SWITCH_COLLECTION) {
384                 ALOGE("Ignoring USER_STATE_POST_UNLOCKED because no user switch collection in "
385                       "progress. Current performance data collection event: %s.",
386                       toString(mCurrCollectionEvent));
387                 return {};
388             }
389             if (mUserSwitchCollection.to != userId) {
390                 ALOGE("Ignoring USER_STATE_POST_UNLOCKED signal for user id: %d. "
391                       "Current user being switched to: %d",
392                       userId, mUserSwitchCollection.to);
393                 return {};
394             }
395             auto thiz = sp<WatchdogPerfService>::fromExisting(this);
396             mHandlerLooper->removeMessages(thiz, SwitchMessage::END_USER_SWITCH_COLLECTION);
397             nsecs_t endUserSwitchCollectionTime =
398                     mHandlerLooper->now() + mPostSystemEventDurationNs.count();
399             mHandlerLooper->sendMessageAtTime(endUserSwitchCollectionTime, thiz,
400                                               SwitchMessage::END_USER_SWITCH_COLLECTION);
401             break;
402         }
403         default:
404             ALOGE("Unsupported user state: %d", static_cast<int>(userState));
405             return {};
406     }
407     if (DEBUG) {
408         ALOGD("Handled user state change: userId = %d, userState = %d", userId,
409               static_cast<int>(userState));
410     }
411     return {};
412 }
413 
startUserSwitchCollection()414 Result<void> WatchdogPerfService::startUserSwitchCollection() {
415     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
416     mHandlerLooper->removeMessages(thiz);
417     mUserSwitchCollection.lastPollUptimeNs = mHandlerLooper->now();
418     // End |EventType::USER_SWITCH_COLLECTION| after a timeout because the user switch end
419     // signal won't be received within a few seconds when the switch is blocked due to a
420     // keyguard event. Otherwise, polling beyond a few seconds will lead to unnecessary data
421     // collection.
422     mHandlerLooper->sendMessageAtTime(mHandlerLooper->now() + mUserSwitchTimeoutNs.count(), thiz,
423                                       SwitchMessage::END_USER_SWITCH_COLLECTION);
424     mCurrCollectionEvent = EventType::USER_SWITCH_COLLECTION;
425     mHandlerLooper->sendMessage(thiz, EventType::USER_SWITCH_COLLECTION);
426     return {};
427 }
428 
onSuspendExit()429 Result<void> WatchdogPerfService::onSuspendExit() {
430     Mutex::Autolock lock(mMutex);
431     if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
432         // Ignoring the suspend exit event because the custom collection takes
433         // precedence over other collections.
434         ALOGE("Unable to start %s. Current performance data collection event: %s",
435               toString(EventType::WAKE_UP_COLLECTION), toString(mCurrCollectionEvent));
436         return {};
437     }
438     if (mCurrCollectionEvent == EventType::WAKE_UP_COLLECTION) {
439         ALOGE("The current performance data collection event is already %s",
440               toString(EventType::WAKE_UP_COLLECTION));
441         return {};
442     }
443     notifySystemStartUpLocked();
444     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
445     mHandlerLooper->removeMessages(thiz);
446     nsecs_t now = mHandlerLooper->now();
447     mWakeUpCollection.lastPollUptimeNs = now;
448     mHandlerLooper->sendMessageAtTime(now + mWakeUpDurationNs.count(), thiz,
449                                       SwitchMessage::END_WAKE_UP_COLLECTION);
450     mCurrCollectionEvent = EventType::WAKE_UP_COLLECTION;
451     mHandlerLooper->sendMessage(thiz, EventType::WAKE_UP_COLLECTION);
452     ALOGI("Switching to %s", toString(mCurrCollectionEvent));
453     return {};
454 }
455 
onShutdownEnter()456 Result<void> WatchdogPerfService::onShutdownEnter() {
457     Mutex::Autolock lock(mMutex);
458     if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
459         ALOGI("Unable to switch to %s during shutdown enter. Current performance data collection "
460               "event: %s",
461               toString(EventType::PERIODIC_COLLECTION), toString(mCurrCollectionEvent));
462         return {};
463     }
464     switchToPeriodicLocked(/*startNow=*/true);
465     return {};
466 }
467 
onCustomCollection(int fd,const char ** args,uint32_t numArgs)468 Result<void> WatchdogPerfService::onCustomCollection(int fd, const char** args, uint32_t numArgs) {
469     if (numArgs == 0) {
470         return Error(BAD_VALUE) << "No custom collection dump arguments";
471     }
472 
473     if (EqualsIgnoreCase(args[0], kStartCustomCollectionFlag)) {
474         if (numArgs > 7) {
475             return Error(BAD_VALUE) << "Number of arguments to start custom performance data "
476                                     << "collection cannot exceed 7";
477         }
478         std::chrono::nanoseconds interval = kCustomCollectionInterval;
479         std::chrono::nanoseconds maxDuration = kCustomCollectionDuration;
480         std::unordered_set<std::string> filterPackages;
481         for (uint32_t i = 1; i < numArgs; ++i) {
482             if (EqualsIgnoreCase(args[i], kIntervalFlag)) {
483                 const auto& result = parseSecondsFlag(args, numArgs, i + 1);
484                 if (!result.ok()) {
485                     return Error(BAD_VALUE)
486                             << "Failed to parse " << kIntervalFlag << ": " << result.error();
487                 }
488                 interval = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
489                 ++i;
490                 continue;
491             }
492             if (EqualsIgnoreCase(args[i], kMaxDurationFlag)) {
493                 const auto& result = parseSecondsFlag(args, numArgs, i + 1);
494                 if (!result.ok()) {
495                     return Error(BAD_VALUE)
496                             << "Failed to parse " << kMaxDurationFlag << ": " << result.error();
497                 }
498                 maxDuration = std::chrono::duration_cast<std::chrono::nanoseconds>(*result);
499                 ++i;
500                 continue;
501             }
502             if (EqualsIgnoreCase(args[i], kFilterPackagesFlag)) {
503                 if (numArgs < i + 1) {
504                     return Error(BAD_VALUE)
505                             << "Must provide value for '" << kFilterPackagesFlag << "' flag";
506                 }
507                 std::vector<std::string> packages = Split(std::string(args[i + 1]), ",");
508                 std::copy(packages.begin(), packages.end(),
509                           std::inserter(filterPackages, filterPackages.end()));
510                 ++i;
511                 continue;
512             }
513             return Error(BAD_VALUE) << "Unknown flag " << args[i]
514                                     << " provided to start custom performance data collection";
515         }
516         if (const auto& result = startCustomCollection(interval, maxDuration, filterPackages);
517             !result.ok()) {
518             return result;
519         }
520         return {};
521     }
522     if (EqualsIgnoreCase(args[0], kEndCustomCollectionFlag)) {
523         if (numArgs != 1) {
524             ALOGW("Number of arguments to stop custom performance data collection cannot exceed 1. "
525                   "Stopping the data collection.");
526             WriteStringToFd("Number of arguments to stop custom performance data collection "
527                             "cannot exceed 1. Stopping the data collection.",
528                             fd);
529         }
530         return endCustomCollection(fd);
531     }
532     return Error(BAD_VALUE) << "Custom perf collection dump arguments start neither with "
533                             << kStartCustomCollectionFlag << " nor with "
534                             << kEndCustomCollectionFlag << " flags";
535 }
536 
onDump(int fd) const537 Result<void> WatchdogPerfService::onDump(int fd) const {
538     Mutex::Autolock lock(mMutex);
539     if (mCurrCollectionEvent == EventType::TERMINATED) {
540         ALOGW("%s not active. Dumping cached data", kServiceName);
541         if (!WriteStringToFd(StringPrintf("%s not active. Dumping cached data.", kServiceName),
542                              fd)) {
543             return Error(FAILED_TRANSACTION) << "Failed to write " << kServiceName << " status";
544         }
545     }
546 
547     if (const auto& result = dumpCollectorsStatusLocked(fd); !result.ok()) {
548         return Error(FAILED_TRANSACTION) << result.error();
549     }
550 
551     if (!WriteStringToFd(StringPrintf("\n%s%s report:\n%sBoot-time collection information:\n%s\n",
552                                       kDumpMajorDelimiter.c_str(), kServiceName,
553                                       kDumpMajorDelimiter.c_str(), std::string(33, '=').c_str()),
554                          fd) ||
555         !WriteStringToFd(mBoottimeCollection.toString(), fd) ||
556         !WriteStringToFd(StringPrintf("\nWake-up collection information:\n%s\n",
557                                       std::string(31, '=').c_str()),
558                          fd) ||
559         !WriteStringToFd(mWakeUpCollection.toString(), fd) ||
560         !WriteStringToFd(StringPrintf("\nUser-switch collection information:\n%s\n",
561                                       std::string(35, '=').c_str()),
562                          fd) ||
563         !WriteStringToFd(mUserSwitchCollection.toString(), fd) ||
564         !WriteStringToFd(StringPrintf("\nPeriodic collection information:\n%s\n",
565                                       std::string(32, '=').c_str()),
566                          fd) ||
567         !WriteStringToFd(mPeriodicCollection.toString(), fd)) {
568         return Error(FAILED_TRANSACTION)
569                 << "Failed to dump the boot-time and periodic collection reports.";
570     }
571 
572     for (const auto& processor : mDataProcessors) {
573         if (const auto result = processor->onDump(fd); !result.ok()) {
574             return result;
575         }
576     }
577 
578     WriteStringToFd(kDumpMajorDelimiter, fd);
579     return {};
580 }
581 
dumpHelpText(int fd) const582 bool WatchdogPerfService::dumpHelpText(int fd) const {
583     return WriteStringToFd(StringPrintf(kHelpText, kServiceName, kStartCustomCollectionFlag,
584                                         kIntervalFlag,
585                                         std::chrono::duration_cast<std::chrono::seconds>(
586                                                 kCustomCollectionInterval)
587                                                 .count(),
588                                         kMaxDurationFlag,
589                                         std::chrono::duration_cast<std::chrono::minutes>(
590                                                 kCustomCollectionDuration)
591                                                 .count(),
592                                         kFilterPackagesFlag, kEndCustomCollectionFlag),
593                            fd);
594 }
595 
dumpCollectorsStatusLocked(int fd) const596 Result<void> WatchdogPerfService::dumpCollectorsStatusLocked(int fd) const {
597     if (!mUidStatsCollector->enabled() &&
598         !WriteStringToFd(StringPrintf("UidStatsCollector failed to access proc and I/O files"),
599                          fd)) {
600         return Error() << "Failed to write UidStatsCollector status";
601     }
602     if (!mProcStatCollector->enabled() &&
603         !WriteStringToFd(StringPrintf("ProcStat collector failed to access the file %s",
604                                       mProcStatCollector->filePath().c_str()),
605                          fd)) {
606         return Error() << "Failed to write ProcStat collector status";
607     }
608     return {};
609 }
610 
startCustomCollection(std::chrono::nanoseconds interval,std::chrono::nanoseconds maxDuration,const std::unordered_set<std::string> & filterPackages)611 Result<void> WatchdogPerfService::startCustomCollection(
612         std::chrono::nanoseconds interval, std::chrono::nanoseconds maxDuration,
613         const std::unordered_set<std::string>& filterPackages) {
614     if (interval < kMinEventInterval || maxDuration < kMinEventInterval) {
615         return Error(INVALID_OPERATION)
616                 << "Collection polling interval and maximum duration must be >= "
617                 << std::chrono::duration_cast<std::chrono::milliseconds>(kMinEventInterval).count()
618                 << " milliseconds";
619     }
620     Mutex::Autolock lock(mMutex);
621     if (mCurrCollectionEvent == EventType::CUSTOM_COLLECTION) {
622         return Error(INVALID_OPERATION) << "Cannot start custom collection more than once";
623     }
624     nsecs_t now = mHandlerLooper->now();
625     mCustomCollection = {
626             .eventType = EventType::CUSTOM_COLLECTION,
627             .pollingIntervalNs = interval,
628             .lastPollUptimeNs = now,
629             .filterPackages = filterPackages,
630     };
631 
632     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
633     mHandlerLooper->removeMessages(thiz);
634     mHandlerLooper->sendMessageAtTime(now + maxDuration.count(), thiz,
635                                       SwitchMessage::END_CUSTOM_COLLECTION);
636     mCurrCollectionEvent = EventType::CUSTOM_COLLECTION;
637     mHandlerLooper->sendMessage(thiz, EventType::CUSTOM_COLLECTION);
638     ALOGI("Starting %s performance data collection", toString(mCurrCollectionEvent));
639     return {};
640 }
641 
endCustomCollection(int fd)642 Result<void> WatchdogPerfService::endCustomCollection(int fd) {
643     Mutex::Autolock lock(mMutex);
644     if (mCurrCollectionEvent != EventType::CUSTOM_COLLECTION) {
645         return Error(INVALID_OPERATION) << "No custom collection is running";
646     }
647 
648     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
649     mHandlerLooper->removeMessages(thiz);
650     mHandlerLooper->sendMessage(thiz, SwitchMessage::END_CUSTOM_COLLECTION);
651 
652     if (const auto result = dumpCollectorsStatusLocked(fd); !result.ok()) {
653         return Error(FAILED_TRANSACTION) << result.error();
654     }
655 
656     if (!WriteStringToFd(StringPrintf("%sPerformance data report for custom collection:\n%s",
657                                       kDumpMajorDelimiter.c_str(), kDumpMajorDelimiter.c_str()),
658                          fd) ||
659         !WriteStringToFd(mCustomCollection.toString(), fd)) {
660         return Error(FAILED_TRANSACTION) << "Failed to write custom collection report.";
661     }
662 
663     for (const auto& processor : mDataProcessors) {
664         if (const auto result = processor->onCustomCollectionDump(fd); !result.ok()) {
665             return Error(FAILED_TRANSACTION)
666                     << processor->name() << " failed on " << toString(mCurrCollectionEvent)
667                     << " collection: " << result.error();
668         }
669     }
670 
671     if (DEBUG) {
672         ALOGD("Custom event finished");
673     }
674     WriteStringToFd(kDumpMajorDelimiter, fd);
675     return {};
676 }
677 
switchToPeriodicLocked(bool startNow)678 void WatchdogPerfService::switchToPeriodicLocked(bool startNow) {
679     if (mCurrCollectionEvent == EventType::PERIODIC_COLLECTION) {
680         ALOGW("The current performance data collection event is already %s",
681               toString(mCurrCollectionEvent));
682         return;
683     }
684     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
685     mHandlerLooper->removeMessages(thiz);
686     mCurrCollectionEvent = EventType::PERIODIC_COLLECTION;
687     mPeriodicCollection.lastPollUptimeNs = mHandlerLooper->now();
688     if (startNow) {
689         mHandlerLooper->sendMessage(thiz, EventType::PERIODIC_COLLECTION);
690     } else {
691         mPeriodicCollection.lastPollUptimeNs += mPeriodicCollection.pollingIntervalNs.count();
692         mHandlerLooper->sendMessageAtTime(mPeriodicCollection.lastPollUptimeNs, thiz,
693                                           EventType::PERIODIC_COLLECTION);
694     }
695     mPeriodicMonitor.lastPollUptimeNs =
696             mHandlerLooper->now() + mPeriodicMonitor.pollingIntervalNs.count();
697     mHandlerLooper->sendMessageAtTime(mPeriodicMonitor.lastPollUptimeNs, thiz,
698                                       EventType::PERIODIC_MONITOR);
699     ALOGI("Switching to %s and %s", toString(mCurrCollectionEvent),
700           toString(EventType::PERIODIC_MONITOR));
701 }
702 
handleMessage(const Message & message)703 void WatchdogPerfService::handleMessage(const Message& message) {
704     Result<void> result;
705 
706     switch (message.what) {
707         case static_cast<int>(EventType::BOOT_TIME_COLLECTION):
708             result = processCollectionEvent(&mBoottimeCollection);
709             break;
710         case static_cast<int>(SwitchMessage::END_BOOTTIME_COLLECTION):
711             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
712             if (result = processCollectionEvent(&mBoottimeCollection); result.ok()) {
713                 Mutex::Autolock lock(mMutex);
714                 switchToPeriodicLocked(/*startNow=*/false);
715             }
716             break;
717         case static_cast<int>(EventType::PERIODIC_COLLECTION):
718             result = processCollectionEvent(&mPeriodicCollection);
719             break;
720         case static_cast<int>(EventType::USER_SWITCH_COLLECTION):
721             result = processCollectionEvent(&mUserSwitchCollection);
722             break;
723         case static_cast<int>(EventType::WAKE_UP_COLLECTION):
724             result = processCollectionEvent(&mWakeUpCollection);
725             break;
726         case static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION):
727         case static_cast<int>(SwitchMessage::END_WAKE_UP_COLLECTION): {
728             mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
729             EventMetadata* eventMetadata =
730                     message.what == static_cast<int>(SwitchMessage::END_USER_SWITCH_COLLECTION)
731                     ? &mUserSwitchCollection
732                     : &mWakeUpCollection;
733             if (result = processCollectionEvent(eventMetadata); result.ok()) {
734                 Mutex::Autolock lock(mMutex);
735                 switchToPeriodicLocked(/*startNow=*/false);
736             }
737             break;
738         }
739         case static_cast<int>(EventType::CUSTOM_COLLECTION):
740             result = processCollectionEvent(&mCustomCollection);
741             break;
742         case static_cast<int>(EventType::PERIODIC_MONITOR):
743             result = processMonitorEvent(&mPeriodicMonitor);
744             break;
745         case static_cast<int>(SwitchMessage::END_CUSTOM_COLLECTION): {
746             Mutex::Autolock lock(mMutex);
747             if (EventType expected = EventType::CUSTOM_COLLECTION;
748                 mCurrCollectionEvent != expected) {
749                 ALOGW("Skipping END_CUSTOM_COLLECTION message as the current collection %s != %s",
750                       toString(mCurrCollectionEvent), toString(expected));
751                 return;
752             }
753             mCustomCollection = {};
754             for (const auto& processor : mDataProcessors) {
755                 /*
756                  * Clear custom collection cache on the data processors when the custom collection
757                  * ends.
758                  */
759                 processor->onCustomCollectionDump(-1);
760             }
761             switchToPeriodicLocked(/*startNow=*/true);
762             return;
763         }
764         case static_cast<int>(TaskMessage::SEND_RESOURCE_STATS):
765             result = sendResourceStats();
766             break;
767         default:
768             result = Error() << "Unknown message: " << message.what;
769     }
770 
771     if (!result.ok()) {
772         Mutex::Autolock lock(mMutex);
773         ALOGE("Terminating %s: %s", kServiceName, result.error().message().c_str());
774         /*
775          * DO NOT CALL terminate() as it tries to join the collection thread but this code is
776          * executed on the collection thread. Thus it will result in a deadlock.
777          */
778         mCurrCollectionEvent = EventType::TERMINATED;
779         mHandlerLooper->removeMessages(sp<WatchdogPerfService>::fromExisting(this));
780         mHandlerLooper->wake();
781     }
782 }
783 
processCollectionEvent(WatchdogPerfService::EventMetadata * metadata)784 Result<void> WatchdogPerfService::processCollectionEvent(
785         WatchdogPerfService::EventMetadata* metadata) {
786     Mutex::Autolock lock(mMutex);
787     /*
788      * Messages sent to the looper are intrinsically racy such that a message from the previous
789      * collection event may land in the looper after the current collection has already begun. Thus
790      * verify the current collection event before starting the collection.
791      */
792     if (mCurrCollectionEvent != metadata->eventType) {
793         ALOGW("Skipping %s event on collection event %s", toString(metadata->eventType),
794               toString(mCurrCollectionEvent));
795         return {};
796     }
797     if (DEBUG) {
798         ALOGD("Processing %s collection event", toString(metadata->eventType));
799     }
800     if (metadata->pollingIntervalNs < kMinEventInterval) {
801         return Error()
802                 << "Collection polling interval of "
803                 << std::chrono::duration_cast<std::chrono::seconds>(metadata->pollingIntervalNs)
804                            .count()
805                 << " seconds for " << toString(metadata->eventType)
806                 << " collection cannot be less than "
807                 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
808                 << " seconds";
809     }
810     if (const auto result = collectLocked(metadata); !result.ok()) {
811         return Error() << toString(metadata->eventType) << " collection failed: " << result.error();
812     }
813     metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
814     mHandlerLooper->sendMessageAtTime(metadata->lastPollUptimeNs,
815                                       sp<WatchdogPerfService>::fromExisting(this),
816                                       metadata->eventType);
817     return {};
818 }
819 
collectLocked(WatchdogPerfService::EventMetadata * metadata)820 Result<void> WatchdogPerfService::collectLocked(WatchdogPerfService::EventMetadata* metadata) {
821     if (!mUidStatsCollector->enabled() && !mProcStatCollector->enabled()) {
822         return Error() << "No collectors enabled";
823     }
824 
825     time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
826 
827     if (mUidStatsCollector->enabled()) {
828         if (const auto result = mUidStatsCollector->collect(); !result.ok()) {
829             return Error() << "Failed to collect per-uid proc and I/O stats: " << result.error();
830         }
831     }
832 
833     if (mProcStatCollector->enabled()) {
834         if (const auto result = mProcStatCollector->collect(); !result.ok()) {
835             return Error() << "Failed to collect proc stats: " << result.error();
836         }
837     }
838 
839     ResourceStats resourceStats = {};
840 
841     for (const auto& processor : mDataProcessors) {
842         Result<void> result;
843         switch (mCurrCollectionEvent) {
844             case EventType::BOOT_TIME_COLLECTION:
845                 result = processor->onBoottimeCollection(now, mUidStatsCollector,
846                                                          mProcStatCollector, &resourceStats);
847                 break;
848             case EventType::PERIODIC_COLLECTION:
849                 result = processor->onPeriodicCollection(now, mSystemState, mUidStatsCollector,
850                                                          mProcStatCollector, &resourceStats);
851                 break;
852             case EventType::USER_SWITCH_COLLECTION: {
853                 WatchdogPerfService::UserSwitchEventMetadata* userSwitchMetadata =
854                         static_cast<WatchdogPerfService::UserSwitchEventMetadata*>(metadata);
855                 result = processor->onUserSwitchCollection(now, userSwitchMetadata->from,
856                                                            userSwitchMetadata->to,
857                                                            mUidStatsCollector, mProcStatCollector);
858                 break;
859             }
860             case EventType::WAKE_UP_COLLECTION:
861                 result = processor->onWakeUpCollection(now, mUidStatsCollector, mProcStatCollector);
862                 break;
863             case EventType::CUSTOM_COLLECTION:
864                 result = processor->onCustomCollection(now, mSystemState, metadata->filterPackages,
865                                                        mUidStatsCollector, mProcStatCollector,
866                                                        &resourceStats);
867                 break;
868             default:
869                 result = Error() << "Invalid collection event " << toString(mCurrCollectionEvent);
870         }
871         if (!result.ok()) {
872             return Error() << processor->name() << " failed on " << toString(mCurrCollectionEvent)
873                            << " collection: " << result.error();
874         }
875     }
876 
877     if (!isEmpty(resourceStats)) {
878         cacheUnsentResourceStatsLocked(std::move(resourceStats));
879     }
880 
881     if (mUnsentResourceStats.empty() || !mWatchdogServiceHelper->isServiceConnected()) {
882         if (DEBUG && !mWatchdogServiceHelper->isServiceConnected()) {
883             ALOGD("Cannot send resource stats since CarWatchdogService not connected.");
884         }
885         return {};
886     }
887 
888     // Send message to send resource stats
889     mHandlerLooper->sendMessage(sp<WatchdogPerfService>::fromExisting(this),
890                                 TaskMessage::SEND_RESOURCE_STATS);
891 
892     return {};
893 }
894 
processMonitorEvent(WatchdogPerfService::EventMetadata * metadata)895 Result<void> WatchdogPerfService::processMonitorEvent(
896         WatchdogPerfService::EventMetadata* metadata) {
897     if (metadata->eventType != static_cast<int>(EventType::PERIODIC_MONITOR)) {
898         return Error() << "Invalid monitor event " << toString(metadata->eventType);
899     }
900     if (DEBUG) {
901         ALOGD("Processing %s monitor event", toString(metadata->eventType));
902     }
903     if (metadata->pollingIntervalNs < kMinEventInterval) {
904         return Error()
905                 << "Monitor polling interval of "
906                 << std::chrono::duration_cast<std::chrono::seconds>(metadata->pollingIntervalNs)
907                            .count()
908                 << " seconds for " << toString(metadata->eventType) << " event cannot be less than "
909                 << std::chrono::duration_cast<std::chrono::seconds>(kMinEventInterval).count()
910                 << " seconds";
911     }
912     Mutex::Autolock lock(mMutex);
913     if (!mProcDiskStatsCollector->enabled()) {
914         return Error() << "Cannot access proc disk stats for monitoring";
915     }
916     time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
917     if (const auto result = mProcDiskStatsCollector->collect(); !result.ok()) {
918         return Error() << "Failed to collect disk stats: " << result.error();
919     }
920     auto* currCollectionMetadata = getCurrentCollectionMetadataLocked();
921     if (currCollectionMetadata == nullptr) {
922         return Error() << "No metadata available for current collection event: "
923                        << toString(mCurrCollectionEvent);
924     }
925     bool requestedCollection = false;
926     auto thiz = sp<WatchdogPerfService>::fromExisting(this);
927     const auto requestCollection = [&]() mutable {
928         if (requestedCollection) {
929             return;
930         }
931         const nsecs_t prevUptimeNs = currCollectionMetadata->lastPollUptimeNs -
932                 currCollectionMetadata->pollingIntervalNs.count();
933         nsecs_t uptimeNs = mHandlerLooper->now();
934         if (const auto delta = std::abs(uptimeNs - prevUptimeNs);
935             delta < kMinEventInterval.count()) {
936             return;
937         }
938         currCollectionMetadata->lastPollUptimeNs = uptimeNs;
939         mHandlerLooper->removeMessages(thiz, currCollectionMetadata->eventType);
940         mHandlerLooper->sendMessage(thiz, currCollectionMetadata->eventType);
941         requestedCollection = true;
942     };
943     for (const auto& processor : mDataProcessors) {
944         if (const auto result =
945                     processor->onPeriodicMonitor(now, mProcDiskStatsCollector, requestCollection);
946             !result.ok()) {
947             return Error() << processor->name() << " failed on " << toString(metadata->eventType)
948                            << ": " << result.error();
949         }
950     }
951     metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
952     if (metadata->lastPollUptimeNs == currCollectionMetadata->lastPollUptimeNs) {
953         /*
954          * If the |PERIODIC_MONITOR| and  *_COLLECTION events overlap, skip the |PERIODIC_MONITOR|
955          * event.
956          */
957         metadata->lastPollUptimeNs += metadata->pollingIntervalNs.count();
958     }
959     mHandlerLooper->sendMessageAtTime(metadata->lastPollUptimeNs, thiz, metadata->eventType);
960     return {};
961 }
962 
sendResourceStats()963 Result<void> WatchdogPerfService::sendResourceStats() {
964     std::vector<ResourceStats> unsentResourceStats = {};
965     {
966         Mutex::Autolock lock(mMutex);
967         nsecs_t now = mHandlerLooper->now();
968         for (auto it = mUnsentResourceStats.begin(); it != mUnsentResourceStats.end();) {
969             if (now - std::get<nsecs_t>(*it) >= kPrevUnsentResourceStatsMaxDurationNs.count()) {
970                 // Drop the expired stats
971                 it = mUnsentResourceStats.erase(it);
972                 continue;
973             }
974             unsentResourceStats.push_back(std::get<ResourceStats>(*it));
975             ++it;
976         }
977     }
978     if (unsentResourceStats.empty()) {
979         return {};
980     }
981     if (auto status = mWatchdogServiceHelper->onLatestResourceStats(unsentResourceStats);
982         !status.isOk()) {
983         ALOGW("Failed to push the unsent resource stats to watchdog service: %s",
984               status.getDescription().c_str());
985         return {};
986     }
987     Mutex::Autolock lock(mMutex);
988     mUnsentResourceStats.clear();
989     if (DEBUG) {
990         ALOGD("Pushed latest resource usage and I/O overuse stats to watchdog service");
991     }
992     return {};
993 }
994 
notifySystemStartUpLocked()995 Result<void> WatchdogPerfService::notifySystemStartUpLocked() {
996     for (const auto& processor : mDataProcessors) {
997         if (const auto result = processor->onSystemStartup(); !result.ok()) {
998             ALOGE("%s failed to process system startup event", processor->name().c_str());
999             return Error() << processor->name() << " failed to process system startup event";
1000         }
1001     }
1002     return {};
1003 }
1004 
cacheUnsentResourceStatsLocked(ResourceStats resourceStats)1005 void WatchdogPerfService::cacheUnsentResourceStatsLocked(ResourceStats resourceStats) {
1006     mUnsentResourceStats.push_back(
1007             std::make_tuple(mHandlerLooper->now(), std::move(resourceStats)));
1008     if (mUnsentResourceStats.size() > kMaxCachedUnsentResourceStats) {
1009         mUnsentResourceStats.erase(mUnsentResourceStats.begin());
1010     }
1011 }
1012 
getCurrentCollectionMetadataLocked()1013 WatchdogPerfService::EventMetadata* WatchdogPerfService::getCurrentCollectionMetadataLocked() {
1014     switch (mCurrCollectionEvent) {
1015         case EventType::BOOT_TIME_COLLECTION:
1016             return &mBoottimeCollection;
1017         case EventType::PERIODIC_COLLECTION:
1018             return &mPeriodicCollection;
1019         case EventType::USER_SWITCH_COLLECTION:
1020             return &mUserSwitchCollection;
1021         case EventType::WAKE_UP_COLLECTION:
1022             return &mWakeUpCollection;
1023         case EventType::CUSTOM_COLLECTION:
1024             return &mCustomCollection;
1025         default:
1026             return nullptr;
1027     }
1028 }
1029 
1030 }  // namespace watchdog
1031 }  // namespace automotive
1032 }  // namespace android
1033