• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false  // STOPSHIP if true.
19 
20 #include "WatchdogProcessService.h"
21 
22 #include "ServiceManager.h"
23 #include "UidProcStatsCollector.h"
24 #include "WatchdogServiceHelper.h"
25 
26 #include <aidl/android/hardware/automotive/vehicle/BnVehicle.h>
27 #include <aidl/android/hardware/automotive/vehicle/ProcessTerminationReason.h>
28 #include <android-base/file.h>
29 #include <android-base/macros.h>
30 #include <android-base/properties.h>
31 #include <android-base/stringprintf.h>
32 #include <android-base/strings.h>
33 #include <binder/IPCThreadState.h>
34 #include <hidl/HidlTransportSupport.h>
35 #include <utils/SystemClock.h>
36 
37 #include <IVhalClient.h>
38 #include <VehicleHalTypes.h>
39 #include <inttypes.h>
40 
41 #include <utility>
42 
43 namespace android {
44 namespace automotive {
45 namespace watchdog {
46 
47 using ::aidl::android::automotive::watchdog::ICarWatchdogClient;
48 using ::aidl::android::automotive::watchdog::TimeoutLength;
49 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogMonitor;
50 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogServiceForSystem;
51 using ::aidl::android::automotive::watchdog::internal::ProcessIdentifier;
52 using ::aidl::android::hardware::automotive::vehicle::BnVehicle;
53 using ::aidl::android::hardware::automotive::vehicle::ProcessTerminationReason;
54 using ::aidl::android::hardware::automotive::vehicle::StatusCode;
55 using ::aidl::android::hardware::automotive::vehicle::SubscribeOptions;
56 using ::aidl::android::hardware::automotive::vehicle::VehiclePropConfig;
57 using ::aidl::android::hardware::automotive::vehicle::VehicleProperty;
58 using ::aidl::android::hardware::automotive::vehicle::VehiclePropertyStatus;
59 using ::aidl::android::hardware::automotive::vehicle::VehiclePropValue;
60 using ::android::sp;
61 using ::android::String16;
62 using ::android::base::Error;
63 using ::android::base::GetIntProperty;
64 using ::android::base::GetProperty;
65 using ::android::base::ReadFileToString;
66 using ::android::base::Result;
67 using ::android::base::StringAppendF;
68 using ::android::base::StringPrintf;
69 using ::android::base::Trim;
70 using ::android::base::WriteStringToFd;
71 using ::android::binder::Status;
72 using ::android::frameworks::automotive::vhal::HalPropError;
73 using ::android::frameworks::automotive::vhal::IHalPropValue;
74 using ::android::frameworks::automotive::vhal::ISubscriptionClient;
75 using ::android::frameworks::automotive::vhal::IVhalClient;
76 using ::android::hardware::hidl_vec;
77 using ::android::hardware::interfacesEqual;
78 using ::android::hardware::Return;
79 using ::android::hidl::base::V1_0::IBase;
80 using ::android::hidl::manager::V1_0::IServiceManager;
81 using ::ndk::ScopedAIBinder_DeathRecipient;
82 using ::ndk::ScopedAStatus;
83 using ::ndk::SpAIBinder;
84 
85 namespace {
86 
87 const std::vector<TimeoutLength> kTimeouts = {TimeoutLength::TIMEOUT_CRITICAL,
88                                               TimeoutLength::TIMEOUT_MODERATE,
89                                               TimeoutLength::TIMEOUT_NORMAL};
90 
91 // TimeoutLength is also used as a message ID. Other message IDs should start next to
92 // TimeoutLength::TIMEOUT_NORMAL.
93 const int32_t MSG_VHAL_WATCHDOG_ALIVE = static_cast<int>(TimeoutLength::TIMEOUT_NORMAL) + 1;
94 const int32_t MSG_VHAL_HEALTH_CHECK = MSG_VHAL_WATCHDOG_ALIVE + 1;
95 const int32_t MSG_CACHE_VHAL_PROCESS_IDENTIFIER = MSG_VHAL_HEALTH_CHECK + 1;
96 
97 // VHAL is supposed to send heart beat every 3s. Car watchdog checks if there is the latest heart
98 // beat from VHAL within 3s, allowing 1s marginal time.
99 // If {@code ro.carwatchdog.vhal_healthcheck.interval} is set, car watchdog checks VHAL health at
100 // the given interval. The lower bound of the interval is 3s.
101 constexpr int32_t kDefaultVhalCheckIntervalSec = 3;
102 constexpr std::chrono::milliseconds kHealthCheckDelayMs = 1s;
103 constexpr int32_t kMaxVhalPidCachingAttempts = 2;
104 constexpr std::chrono::nanoseconds kDefaultVhalPidCachingRetryDelayNs = 30s;
105 constexpr TimeoutLength kCarWatchdogServiceTimeoutDelay = TimeoutLength::TIMEOUT_CRITICAL;
106 constexpr int32_t kMissingIntPropertyValue = -1;
107 
108 constexpr const char kPropertyVhalCheckInterval[] = "ro.carwatchdog.vhal_healthcheck.interval";
109 constexpr const char kPropertyClientCheckInterval[] = "ro.carwatchdog.client_healthcheck.interval";
110 constexpr const char kServiceName[] = "WatchdogProcessService";
111 constexpr const char kHidlVhalInterfaceName[] = "android.hardware.automotive.vehicle@2.0::IVehicle";
112 
113 const std::function<sp<IServiceManager>()> kDefaultTryGetHidlServiceManager =
__anon1c1bfe4b0202() 114         []() -> sp<IServiceManager> { return IServiceManager::tryGetService(/*getStub=*/false); };
115 
116 enum RegistrationError {
117     ERR_ILLEGAL_STATE = 0,
118     ERR_DUPLICATE_REGISTRATION,
119 };
120 
toScopedAStatus(Result<void> resultWithRegistrationError)121 ScopedAStatus toScopedAStatus(Result<void> resultWithRegistrationError) {
122     if (resultWithRegistrationError.ok()) {
123         return ScopedAStatus::ok();
124     }
125     if (resultWithRegistrationError.error().code() ==
126         RegistrationError::ERR_DUPLICATE_REGISTRATION) {
127         return ScopedAStatus::ok();
128     }
129     return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
130                                                        resultWithRegistrationError.error()
131                                                                .message()
132                                                                .c_str());
133 }
134 
toPidString(const std::vector<ProcessIdentifier> & processIdentifiers)135 std::string toPidString(const std::vector<ProcessIdentifier>& processIdentifiers) {
136     size_t size = processIdentifiers.size();
137     if (size == 0) {
138         return "";
139     }
140     std::string buffer;
141     StringAppendF(&buffer, "%d", processIdentifiers[0].pid);
142     for (size_t i = 1; i < size; i++) {
143         StringAppendF(&buffer, ", %d", processIdentifiers[i].pid);
144     }
145     return buffer;
146 }
147 
isSystemShuttingDown()148 bool isSystemShuttingDown() {
149     std::string sysPowerCtl;
150     std::istringstream tokenStream(GetProperty("sys.powerctl", ""));
151     std::getline(tokenStream, sysPowerCtl, ',');
152     return sysPowerCtl == "reboot" || sysPowerCtl == "shutdown";
153 }
154 
getStartTimeForPid(pid_t pid)155 int64_t getStartTimeForPid(pid_t pid) {
156     auto pidStat = UidProcStatsCollector::readStatFileForPid(pid);
157     if (!pidStat.ok()) {
158         return elapsedRealtime();
159     }
160     return pidStat->startTimeMillis;
161 }
162 
onBinderDied(void * cookie)163 void onBinderDied(void* cookie) {
164     const auto& thiz = ServiceManager::getInstance()->getWatchdogProcessService();
165     if (thiz == nullptr) {
166         return;
167     }
168     thiz->handleBinderDeath(cookie);
169 }
queryHidlServiceManagerForVhalPid(const sp<IServiceManager> & hidlServiceManager)170 Result<pid_t> queryHidlServiceManagerForVhalPid(const sp<IServiceManager>& hidlServiceManager) {
171     pid_t pid = -1;
172     Return<void> ret = hidlServiceManager->debugDump([&](auto& hals) {
173         for (const auto& info : hals) {
174             if (info.pid == static_cast<int>(IServiceManager::PidConstant::NO_PID)) {
175                 continue;
176             }
177             if (info.interfaceName == kHidlVhalInterfaceName) {
178                 pid = info.pid;
179                 return;
180             }
181         }
182     });
183 
184     if (!ret.isOk()) {
185         return Error() << "Failed to get VHAL process id from HIDL service manager";
186     }
187     if (pid == -1) {
188         return Error() << "No VHAL service registered to HIDL service manager";
189     }
190     return pid;
191 }
192 
193 }  // namespace
194 
WatchdogProcessService(const sp<Looper> & handlerLooper)195 WatchdogProcessService::WatchdogProcessService(const sp<Looper>& handlerLooper) :
196       WatchdogProcessService(IVhalClient::tryCreate, kDefaultTryGetHidlServiceManager,
197                              getStartTimeForPid, kDefaultVhalPidCachingRetryDelayNs, handlerLooper,
198                              sp<AIBinderDeathRegistrationWrapper>::make()) {}
199 
WatchdogProcessService(const std::function<std::shared_ptr<IVhalClient> ()> & tryCreateVhalClientFunc,const std::function<sp<IServiceManager> ()> & tryGetHidlServiceManagerFunc,const std::function<int64_t (pid_t)> & getStartTimeForPidFunc,const std::chrono::nanoseconds & vhalPidCachingRetryDelayNs,const sp<Looper> & handlerLooper,const sp<AIBinderDeathRegistrationWrapperInterface> & deathRegistrationWrapper)200 WatchdogProcessService::WatchdogProcessService(
201         const std::function<std::shared_ptr<IVhalClient>()>& tryCreateVhalClientFunc,
202         const std::function<sp<IServiceManager>()>& tryGetHidlServiceManagerFunc,
203         const std::function<int64_t(pid_t)>& getStartTimeForPidFunc,
204         const std::chrono::nanoseconds& vhalPidCachingRetryDelayNs, const sp<Looper>& handlerLooper,
205         const sp<AIBinderDeathRegistrationWrapperInterface>& deathRegistrationWrapper) :
206       kTryCreateVhalClientFunc(tryCreateVhalClientFunc),
207       kTryGetHidlServiceManagerFunc(tryGetHidlServiceManagerFunc),
208       kGetStartTimeForPidFunc(getStartTimeForPidFunc),
209       kVhalPidCachingRetryDelayNs(vhalPidCachingRetryDelayNs),
210       mHandlerLooper(handlerLooper),
211       mClientBinderDeathRecipient(
212               ScopedAIBinder_DeathRecipient(AIBinder_DeathRecipient_new(onBinderDied))),
213       mLastSessionId(0),
214       mServiceStarted(false),
215       mDeathRegistrationWrapper(deathRegistrationWrapper),
216       mIsEnabled(true),
217       mVhalService(nullptr),
218       mTotalVhalPidCachingAttempts(0) {
219     mVhalBinderDiedCallback =
220             std::make_shared<IVhalClient::OnBinderDiedCallbackFunc>([this] { handleVhalDeath(); });
221     for (const auto& timeout : kTimeouts) {
222         mClientsByTimeout.insert(std::make_pair(timeout, ClientInfoMap()));
223         mPingedClients.insert(std::make_pair(timeout, PingedClientMap()));
224     }
225 
226     int32_t vhalHealthCheckIntervalSec =
227             GetIntProperty(kPropertyVhalCheckInterval, kDefaultVhalCheckIntervalSec);
228     vhalHealthCheckIntervalSec = std::max(vhalHealthCheckIntervalSec, kDefaultVhalCheckIntervalSec);
229     mVhalHealthCheckWindowMs = std::chrono::seconds(vhalHealthCheckIntervalSec);
230 
231     int32_t clientHealthCheckIntervalSec =
232             GetIntProperty(kPropertyClientCheckInterval, kMissingIntPropertyValue);
233     // Overridden timeout value must be greater than or equal to the maximum possible timeout value.
234     // Otherwise, clients will be pinged more frequently than the guaranteed timeout duration.
235     if (clientHealthCheckIntervalSec != kMissingIntPropertyValue) {
236         int32_t normalSec = std::chrono::duration_cast<std::chrono::seconds>(
237                                     getTimeoutDurationNs(TimeoutLength::TIMEOUT_NORMAL))
238                                     .count();
239         mOverriddenClientHealthCheckWindowNs = std::optional<std::chrono::seconds>{
240                 std::max(clientHealthCheckIntervalSec, normalSec)};
241     }
242 }
243 
registerClient(const std::shared_ptr<ICarWatchdogClient> & client,TimeoutLength timeout)244 ScopedAStatus WatchdogProcessService::registerClient(
245         const std::shared_ptr<ICarWatchdogClient>& client, TimeoutLength timeout) {
246     if (client == nullptr) {
247         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
248                                                            "Must provide non-null client");
249     }
250     pid_t callingPid = IPCThreadState::self()->getCallingPid();
251     uid_t callingUid = IPCThreadState::self()->getCallingUid();
252 
253     ClientInfo clientInfo(client, callingPid, callingUid, kGetStartTimeForPidFunc(callingPid),
254                           *this);
255     return toScopedAStatus(registerClient(clientInfo, timeout));
256 }
257 
unregisterClient(const std::shared_ptr<ICarWatchdogClient> & client)258 ScopedAStatus WatchdogProcessService::unregisterClient(
259         const std::shared_ptr<ICarWatchdogClient>& client) {
260     if (client == nullptr) {
261         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
262                                                            "Must provide non-null client");
263     }
264     Mutex::Autolock lock(mMutex);
265     return unregisterClientLocked(kTimeouts, client->asBinder(), ClientType::Regular);
266 }
267 
registerCarWatchdogService(const SpAIBinder & binder,const sp<WatchdogServiceHelperInterface> & helper)268 ScopedAStatus WatchdogProcessService::registerCarWatchdogService(
269         const SpAIBinder& binder, const sp<WatchdogServiceHelperInterface>& helper) {
270     pid_t callingPid = IPCThreadState::self()->getCallingPid();
271     uid_t callingUid = IPCThreadState::self()->getCallingUid();
272 
273     if (helper == nullptr) {
274         return ScopedAStatus::
275                 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
276                                              "Watchdog service helper instance is null");
277     }
278     ClientInfo clientInfo(helper, binder, callingPid, callingUid,
279                           kGetStartTimeForPidFunc(callingPid), *this);
280     if (auto result = registerClient(clientInfo, kCarWatchdogServiceTimeoutDelay); !result.ok()) {
281         return toScopedAStatus(result);
282     }
283     Mutex::Autolock lock(mMutex);
284     if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0 &&
285         mVhalService != nullptr && mVhalService->isAidlVhal() &&
286         !mVhalProcessIdentifier.has_value()) {
287         // When CarService is restarted in the middle handling the AIDL VHAL pid fetch request,
288         // the request will fail. Restart the caching process only when the AIDL VHAL pid is
289         // missing.
290         mTotalVhalPidCachingAttempts = 0;
291         mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
292     }
293     return ScopedAStatus::ok();
294 }
295 
unregisterCarWatchdogService(const SpAIBinder & binder)296 void WatchdogProcessService::unregisterCarWatchdogService(const SpAIBinder& binder) {
297     Mutex::Autolock lock(mMutex);
298 
299     std::vector<TimeoutLength> timeouts = {TimeoutLength::TIMEOUT_CRITICAL};
300     unregisterClientLocked(timeouts, binder, ClientType::Service);
301 }
302 
registerMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)303 ScopedAStatus WatchdogProcessService::registerMonitor(
304         const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
305     if (monitor == nullptr) {
306         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
307                                                            "Must provide non-null monitor");
308     }
309     const auto binder = monitor->asBinder();
310     {
311         Mutex::Autolock lock(mMutex);
312         if (mMonitor != nullptr) {
313             if (mMonitor->asBinder() == binder) {
314                 return ScopedAStatus::ok();
315             }
316             AIBinder* aiBinder = mMonitor->asBinder().get();
317             mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
318                                                      static_cast<void*>(aiBinder));
319         }
320         mMonitor = monitor;
321     }
322 
323     AIBinder* aiBinder = binder.get();
324     auto status =
325             mDeathRegistrationWrapper->linkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
326                                                    static_cast<void*>(aiBinder));
327     if (!status.isOk()) {
328         {
329             Mutex::Autolock lock(mMutex);
330             if (mMonitor != nullptr && mMonitor->asBinder() == binder) {
331                 mMonitor.reset();
332             }
333         }
334         ALOGW("Failed to register the monitor as it is dead.");
335         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
336                                                            "The monitor is dead.");
337     }
338     if (DEBUG) {
339         ALOGD("Car watchdog monitor is registered");
340     }
341     return ScopedAStatus::ok();
342 }
343 
unregisterMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)344 ScopedAStatus WatchdogProcessService::unregisterMonitor(
345         const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
346     if (monitor == nullptr) {
347         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
348                                                            "Must provide non-null monitor");
349     }
350     const auto binder = monitor->asBinder();
351     Mutex::Autolock lock(mMutex);
352     if (mMonitor == nullptr || mMonitor->asBinder() != binder) {
353         ALOGW("Failed to unregister the monitor as it has not been registered.");
354         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
355                                                            "The monitor has not been registered.");
356     }
357     AIBinder* aiBinder = binder.get();
358     mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
359                                              static_cast<void*>(aiBinder));
360     mMonitor.reset();
361     if (DEBUG) {
362         ALOGD("Car watchdog monitor is unregistered");
363     }
364     return ScopedAStatus::ok();
365 }
366 
tellClientAlive(const std::shared_ptr<ICarWatchdogClient> & client,int32_t sessionId)367 ScopedAStatus WatchdogProcessService::tellClientAlive(
368         const std::shared_ptr<ICarWatchdogClient>& client, int32_t sessionId) {
369     if (client == nullptr) {
370         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
371                                                            "Must provide non-null client");
372     }
373     Mutex::Autolock lock(mMutex);
374     return tellClientAliveLocked(client->asBinder(), sessionId);
375 }
376 
tellCarWatchdogServiceAlive(const std::shared_ptr<ICarWatchdogServiceForSystem> & service,const std::vector<ProcessIdentifier> & clientsNotResponding,int32_t sessionId)377 ScopedAStatus WatchdogProcessService::tellCarWatchdogServiceAlive(
378         const std::shared_ptr<ICarWatchdogServiceForSystem>& service,
379         const std::vector<ProcessIdentifier>& clientsNotResponding, int32_t sessionId) {
380     if (service == nullptr) {
381         return ScopedAStatus::
382                 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
383                                              "Must provide non-null car watchdog service");
384     }
385     ScopedAStatus status;
386     {
387         Mutex::Autolock lock(mMutex);
388         if (DEBUG) {
389             if (clientsNotResponding.size() > 0) {
390                 ALOGD("CarWatchdogService(session: %d) responded with non-responding clients: %s",
391                       sessionId, toPidString(clientsNotResponding).c_str());
392             }
393         }
394         status = tellClientAliveLocked(service->asBinder(), sessionId);
395     }
396     if (status.isOk()) {
397         dumpAndKillAllProcesses(clientsNotResponding, /*reportToVhal=*/true);
398     }
399     return status;
400 }
401 
tellDumpFinished(const std::shared_ptr<ICarWatchdogMonitor> & monitor,const ProcessIdentifier & processIdentifier)402 ScopedAStatus WatchdogProcessService::tellDumpFinished(
403         const std::shared_ptr<ICarWatchdogMonitor>& monitor,
404         const ProcessIdentifier& processIdentifier) {
405     Mutex::Autolock lock(mMutex);
406     if (mMonitor == nullptr || monitor == nullptr || mMonitor->asBinder() != monitor->asBinder()) {
407         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
408                                                            "The monitor is not registered or an "
409                                                            "invalid monitor is given");
410     }
411     ALOGI("Process(pid: %d) has been dumped and killed", processIdentifier.pid);
412     return ScopedAStatus::ok();
413 }
414 
setEnabled(bool isEnabled)415 void WatchdogProcessService::setEnabled(bool isEnabled) {
416     Mutex::Autolock lock(mMutex);
417     if (mIsEnabled == isEnabled) {
418         return;
419     }
420     ALOGI("%s is %s", kServiceName, isEnabled ? "enabled" : "disabled");
421     mIsEnabled = isEnabled;
422     mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
423     if (!mIsEnabled) {
424         return;
425     }
426     if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
427         mVhalHeartBeat.eventTime = uptimeMillis();
428         std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
429         mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
430                                            Message(MSG_VHAL_HEALTH_CHECK));
431     }
432     for (const auto& timeout : kTimeouts) {
433         mHandlerLooper->removeMessages(mMessageHandler, static_cast<int>(timeout));
434         startHealthCheckingLocked(timeout);
435     }
436 }
437 
onUserStateChange(userid_t userId,bool isStarted)438 void WatchdogProcessService::onUserStateChange(userid_t userId, bool isStarted) {
439     std::string buffer;
440     Mutex::Autolock lock(mMutex);
441     if (isStarted) {
442         mStoppedUserIds.erase(userId);
443     } else {
444         mStoppedUserIds.insert(userId);
445     }
446 }
447 
onDump(int fd)448 void WatchdogProcessService::onDump(int fd) {
449     Mutex::Autolock lock(mMutex);
450     const char* indent = "  ";
451     const char* doubleIndent = "    ";
452     std::string buffer;
453     WriteStringToFd("CAR WATCHDOG PROCESS SERVICE\n", fd);
454     WriteStringToFd(StringPrintf("%s%s enabled: %s\n", indent, kServiceName,
455                                  mIsEnabled ? "true" : "false"),
456                     fd);
457     WriteStringToFd(StringPrintf("%sRegistered clients\n", indent), fd);
458     int count = 1;
459     for (const auto& timeout : kTimeouts) {
460         ClientInfoMap& clients = mClientsByTimeout[timeout];
461         for (auto it = clients.begin(); it != clients.end(); it++, count++) {
462             WriteStringToFd(StringPrintf("%sClient #%d: %s\n", doubleIndent, count,
463                                          it->second.toString().c_str()),
464                             fd);
465         }
466     }
467     WriteStringToFd(StringPrintf("%sMonitor registered: %s\n", indent,
468                                  mMonitor == nullptr ? "false" : "true"),
469                     fd);
470     WriteStringToFd(StringPrintf("%sisSystemShuttingDown: %s\n", indent,
471                                  isSystemShuttingDown() ? "true" : "false"),
472                     fd);
473     buffer = "none";
474     bool first = true;
475     for (const auto& userId : mStoppedUserIds) {
476         if (first) {
477             buffer = StringPrintf("%d", userId);
478             first = false;
479         } else {
480             StringAppendF(&buffer, ", %d", userId);
481         }
482     }
483     WriteStringToFd(StringPrintf("%sStopped users: %s\n", indent, buffer.c_str()), fd);
484     if (mVhalService != nullptr &&
485         mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
486         int64_t systemUptime = uptimeMillis();
487         WriteStringToFd(StringPrintf("%sVHAL health check is supported:\n%s\tVHAL health check "
488                                      "interval: %lld ms\n%s\tVHAL heartbeat was updated %" PRIi64
489                                      " ms ago",
490                                      indent, indent, mVhalHealthCheckWindowMs.count(), indent,
491                                      systemUptime - mVhalHeartBeat.eventTime),
492                         fd);
493         std::string vhalType = mVhalService->isAidlVhal() ? "AIDL" : "HIDL";
494         if (mVhalProcessIdentifier.has_value()) {
495             WriteStringToFd(StringPrintf("%s%s VHAL process identifier (PID = %d, Start time "
496                                          "millis = "
497                                          "%" PRIi64 ")",
498                                          indent, vhalType.c_str(), mVhalProcessIdentifier->pid,
499                                          mVhalProcessIdentifier->startTimeMillis),
500                             fd);
501         } else if (mTotalVhalPidCachingAttempts < kMaxVhalPidCachingAttempts) {
502             WriteStringToFd(StringPrintf("%sStill fetching %s VHAL process identifier. "
503                                          "Total attempts made = %d, Remaining attempts = %d",
504                                          indent, vhalType.c_str(), mTotalVhalPidCachingAttempts,
505                                          kMaxVhalPidCachingAttempts - mTotalVhalPidCachingAttempts),
506                             fd);
507         } else {
508             WriteStringToFd(StringPrintf("%sFailed to fetch %s VHAL process identifier. "
509                                          "Cannot terminate VHAL when VHAL becomes unresponsive",
510                                          indent, vhalType.c_str()),
511                             fd);
512         }
513     } else if (mVhalService != nullptr) {
514         WriteStringToFd(StringPrintf("%sVHAL client is connected but the heartbeat property is not "
515                                      "supported",
516                                      indent),
517                         fd);
518     } else {
519         WriteStringToFd(StringPrintf("%sVHAL client is not connected", indent), fd);
520     }
521 }
522 
// Runs one health check round for the timeout encoded in the given message ID.
// Steps, in order:
//   1. Removes pending messages with the same ID and bails out when the service is disabled.
//   2. Dumps and kills the clients that did not respond to the previous round.
//   3. Assigns a new session id to every registered client of this timeout whose user is not
//      stopped and records it as pinged.
//   4. Pings those clients without holding mMutex; a client whose ping cannot be delivered is
//      removed from the pinged set.
//   5. Schedules the next round when at least one client was selected for pinging.
void WatchdogProcessService::doHealthCheck(int what) {
    mHandlerLooper->removeMessages(mMessageHandler, what);
    {
        Mutex::Autolock lock(mMutex);
        if (!mIsEnabled) {
            return;
        }
    }
    const auto timeout = static_cast<TimeoutLength>(what);
    dumpAndKillClientsIfNotResponding(timeout);

    /* Snapshot the clients into a local vector so they can be pinged without holding mMutex.
     * Working from a snapshot may send unnecessary ping messages to clients after they are
     * unregistered. Clients should be able to handle them.
     */
    std::vector<ClientInfo> pingTargets;
    PingedClientMap* pinged = nullptr;
    {
        Mutex::Autolock lock(mMutex);
        pinged = &mPingedClients[timeout];
        pinged->clear();
        for (auto& [_, registeredClient] : mClientsByTimeout[timeout]) {
            if (mStoppedUserIds.count(registeredClient.kUserId) != 0) {
                continue;
            }
            const int sessionId = getNewSessionId();
            registeredClient.sessionId = sessionId;
            pingTargets.push_back(registeredClient);
            pinged->emplace(sessionId, registeredClient);
        }
    }

    for (const auto& target : pingTargets) {
        const auto status = target.checkIfAlive(timeout);
        if (status.isOk()) {
            continue;
        }
        if (DEBUG) {
            ALOGW("Failed to send a ping message to client(pid: %d): %s", target.kPid,
                  status.getMessage());
        }
        Mutex::Autolock lock(mMutex);
        pinged->erase(target.sessionId);
    }
    // Though the size of the pinged set is a more specific measure, the snapshot size is used as
    // a conservative approach.
    if (!pingTargets.empty()) {
        const auto durationNs = getTimeoutDurationNs(timeout);
        mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
    }
}
571 
start()572 Result<void> WatchdogProcessService::start() {
573     if (mServiceStarted) {
574         return Error(INVALID_OPERATION) << "Cannot start process monitoring more than once";
575     }
576     auto thiz = sp<WatchdogProcessService>::fromExisting(this);
577     mMessageHandler = sp<MessageHandlerImpl>::make(thiz);
578     mPropertyChangeListener = std::make_shared<PropertyChangeListener>(thiz);
579     mServiceStarted = true;
580     reportWatchdogAliveToVhal();
581     return {};
582 }
583 
terminate()584 void WatchdogProcessService::terminate() {
585     std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
586     {
587         Mutex::Autolock lock(mMutex);
588         if (!mServiceStarted) {
589             return;
590         }
591         for (auto& [_, clients] : mClientsByTimeout) {
592             for (auto& [_, client] : clients) {
593                 client.unlinkToDeath(mClientBinderDeathRecipient.get());
594             }
595             clients.clear();
596         }
597         mClientsByTimeout.clear();
598         if (mMonitor != nullptr) {
599             AIBinder* aiBinder = mMonitor->asBinder().get();
600             mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
601                                                      static_cast<void*>(aiBinder));
602             mMonitor.reset();
603         }
604         mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
605         mServiceStarted = false;
606         if (mVhalService == nullptr) {
607             return;
608         }
609         if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
610             propertySubscriptionClient =
611                     mVhalService->getSubscriptionClient(mPropertyChangeListener);
612         }
613         mVhalService->removeOnBinderDiedCallback(mVhalBinderDiedCallback);
614         resetVhalInfoLocked();
615     }
616     if (propertySubscriptionClient != nullptr) {
617         std::vector<int32_t> propIds = {static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)};
618         auto result = propertySubscriptionClient->unsubscribe(propIds);
619         if (!result.ok()) {
620             ALOGW("Failed to unsubscribe from VHAL_HEARTBEAT.");
621         }
622     }
623 }
624 
registerClient(const ClientInfo & clientInfo,TimeoutLength timeout)625 Result<void> WatchdogProcessService::registerClient(const ClientInfo& clientInfo,
626                                                     TimeoutLength timeout) {
627     uintptr_t cookieId = reinterpret_cast<uintptr_t>(clientInfo.getAIBinder());
628     {
629         Mutex::Autolock lock(mMutex);
630         if (findClientAndProcessLocked(kTimeouts, clientInfo.getAIBinder(), nullptr)) {
631             return Error(RegistrationError::ERR_DUPLICATE_REGISTRATION)
632                     << "Failed to register (" << clientInfo.toString()
633                     << ") as it is already registered";
634         }
635 
636         ClientInfoMap& clients = mClientsByTimeout[timeout];
637         clients.insert(std::make_pair(cookieId, clientInfo));
638     }
639     if (auto status = clientInfo.linkToDeath(mClientBinderDeathRecipient.get()); !status.isOk()) {
640         Mutex::Autolock lock(mMutex);
641         if (auto it = mClientsByTimeout.find(timeout); it != mClientsByTimeout.end()) {
642             if (const auto& clientIt = it->second.find(cookieId); clientIt != it->second.end()) {
643                 it->second.erase(clientIt);
644             }
645         }
646         return Error(RegistrationError::ERR_ILLEGAL_STATE)
647                 << "Failed to register (" << clientInfo.toString() << ") as it is dead";
648     }
649     if (DEBUG) {
650         ALOGD("Car watchdog client (%s, timeout = %d) is registered", clientInfo.toString().c_str(),
651               timeout);
652     }
653     Mutex::Autolock lock(mMutex);
654     // If the client array becomes non-empty, start health checking.
655     if (mClientsByTimeout[timeout].size() == 1) {
656         startHealthCheckingLocked(timeout);
657         ALOGI("Starting health checking for timeout = %d", timeout);
658     }
659     return {};
660 }
661 
unregisterClientLocked(const std::vector<TimeoutLength> & timeouts,const SpAIBinder & binder,ClientType clientType)662 ScopedAStatus WatchdogProcessService::unregisterClientLocked(
663         const std::vector<TimeoutLength>& timeouts, const SpAIBinder& binder,
664         ClientType clientType) {
665     const char* clientName = clientType == ClientType::Regular ? "client" : "service";
666     bool result = findClientAndProcessLocked(timeouts, binder.get(),
667                                              [&](ClientInfoMap& clients,
668                                                  ClientInfoMap::const_iterator it) {
669                                                  it->second.unlinkToDeath(
670                                                          mClientBinderDeathRecipient.get());
671                                                  clients.erase(it);
672                                              });
673     if (!result) {
674         std::string errorStr =
675                 StringPrintf("The car watchdog %s has not been registered", clientName);
676         const char* errorCause = errorStr.c_str();
677         ALOGW("Failed to unregister the car watchdog %s: %s", clientName, errorCause);
678         return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT, errorCause);
679     }
680     if (DEBUG) {
681         ALOGD("Car watchdog %s is unregistered", clientName);
682     }
683     return ScopedAStatus::ok();
684 }
685 
tellClientAliveLocked(const SpAIBinder & binder,int32_t sessionId)686 ScopedAStatus WatchdogProcessService::tellClientAliveLocked(const SpAIBinder& binder,
687                                                             int32_t sessionId) {
688     for (const auto& timeout : kTimeouts) {
689         PingedClientMap& clients = mPingedClients[timeout];
690         PingedClientMap::const_iterator it = clients.find(sessionId);
691         if (it == clients.cend() || it->second.getAIBinder() != binder.get()) {
692             continue;
693         }
694         clients.erase(it);
695         return ScopedAStatus::ok();
696     }
697     return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
698                                                        "The client is not registered or the "
699                                                        "session ID is not found");
700 }
701 
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,AIBinder * aiBinder,const Processor & processor)702 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
703                                                         AIBinder* aiBinder,
704                                                         const Processor& processor) {
705     return findClientAndProcessLocked(timeouts, reinterpret_cast<uintptr_t>(aiBinder), processor);
706 }
707 
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,uintptr_t binderPtrId,const Processor & processor)708 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
709                                                         uintptr_t binderPtrId,
710                                                         const Processor& processor) {
711     for (const auto& timeout : timeouts) {
712         auto clientsByIdIt = mClientsByTimeout.find(timeout);
713         if (clientsByIdIt == mClientsByTimeout.end()) {
714             continue;
715         }
716         auto it = clientsByIdIt->second.find(binderPtrId);
717         if (it == clientsByIdIt->second.end()) {
718             continue;
719         }
720         if (processor != nullptr) {
721             processor(clientsByIdIt->second, it);
722         }
723         return true;
724     }
725 
726     return false;
727 }
728 
startHealthCheckingLocked(TimeoutLength timeout)729 Result<void> WatchdogProcessService::startHealthCheckingLocked(TimeoutLength timeout) {
730     PingedClientMap& clients = mPingedClients[timeout];
731     clients.clear();
732     int what = static_cast<int>(timeout);
733     auto durationNs = getTimeoutDurationNs(timeout);
734     mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
735     return {};
736 }
737 
dumpAndKillClientsIfNotResponding(TimeoutLength timeout)738 Result<void> WatchdogProcessService::dumpAndKillClientsIfNotResponding(TimeoutLength timeout) {
739     std::vector<ProcessIdentifier> processIdentifiers;
740     std::vector<const ClientInfo*> clientsToNotify;
741     {
742         Mutex::Autolock lock(mMutex);
743         PingedClientMap& clients = mPingedClients[timeout];
744         for (PingedClientMap::const_iterator it = clients.cbegin(); it != clients.cend(); it++) {
745             pid_t pid = -1;
746             userid_t userId = -1;
747             uint64_t startTimeMillis = 0;
748             std::vector<TimeoutLength> timeouts = {timeout};
749             findClientAndProcessLocked(timeouts, it->second.getAIBinder(),
750                                        [&](ClientInfoMap& cachedClients,
751                                            ClientInfoMap::const_iterator cachedClientsIt) {
752                                            auto clientInfo = cachedClientsIt->second;
753                                            pid = clientInfo.kPid;
754                                            startTimeMillis = clientInfo.kStartTimeMillis;
755                                            userId = clientInfo.kUserId;
756                                            clientInfo.unlinkToDeath(
757                                                    mClientBinderDeathRecipient.get());
758                                            cachedClients.erase(cachedClientsIt);
759                                        });
760             if (pid != -1 && mStoppedUserIds.count(userId) == 0) {
761                 clientsToNotify.emplace_back(&it->second);
762                 ProcessIdentifier processIdentifier;
763                 processIdentifier.pid = pid;
764                 processIdentifier.startTimeMillis = startTimeMillis;
765                 processIdentifiers.push_back(processIdentifier);
766             }
767         }
768     }
769     for (const ClientInfo*& clientInfo : clientsToNotify) {
770         clientInfo->prepareProcessTermination();
771     }
772     return dumpAndKillAllProcesses(processIdentifiers, /*reportToVhal=*/true);
773 }
774 
dumpAndKillAllProcesses(const std::vector<ProcessIdentifier> & processesNotResponding,bool reportToVhal)775 Result<void> WatchdogProcessService::dumpAndKillAllProcesses(
776         const std::vector<ProcessIdentifier>& processesNotResponding, bool reportToVhal) {
777     size_t size = processesNotResponding.size();
778     if (size == 0) {
779         return {};
780     }
781     std::string pidString = toPidString(processesNotResponding);
782     std::shared_ptr<ICarWatchdogMonitor> monitor;
783     {
784         Mutex::Autolock lock(mMutex);
785         if (mMonitor == nullptr) {
786             std::string errorMsg =
787                     StringPrintf("Failed to dump and kill processes(pid = %s): Monitor is not set",
788                                  pidString.c_str());
789             ALOGW("%s", errorMsg.c_str());
790             return Error() << errorMsg;
791         }
792         monitor = mMonitor;
793     }
794     if (isSystemShuttingDown()) {
795         ALOGI("Skip dumping and killing processes(%s): The system is shutting down",
796               pidString.c_str());
797         return {};
798     }
799     if (reportToVhal) {
800         reportTerminatedProcessToVhal(processesNotResponding);
801     }
802     monitor->onClientsNotResponding(processesNotResponding);
803     if (DEBUG) {
804         ALOGD("Dumping and killing processes is requested: %s", pidString.c_str());
805     }
806     return {};
807 }
808 
809 // Handle when car watchdog clients die.
handleBinderDeath(void * cookie)810 void WatchdogProcessService::handleBinderDeath(void* cookie) {
811     uintptr_t cookieId = reinterpret_cast<uintptr_t>(cookie);
812 
813     // The same binder death recipient is used for both monitor and client deaths. So, check both
814     // the monitor and all the clients until a match is found.
815     Mutex::Autolock lock(mMutex);
816     if (mMonitor != nullptr) {
817         if (AIBinder* aiBinder = mMonitor->asBinder().get();
818             reinterpret_cast<uintptr_t>(aiBinder) == cookieId) {
819             mMonitor.reset();
820             ALOGW("The monitor has died.");
821             return;
822         }
823     }
824 
825     findClientAndProcessLocked(kTimeouts, cookieId,
826                                [&](ClientInfoMap& clients, ClientInfoMap::const_iterator it) {
827                                    ALOGW("Client(pid: %d) died", it->second.kPid);
828                                    clients.erase(it);
829                                });
830 }
831 
832 // Handle when VHAL dies.
handleVhalDeath()833 void WatchdogProcessService::handleVhalDeath() {
834     Mutex::Autolock lock(mMutex);
835     ALOGW("VHAL has died.");
836     mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
837     // Destroying mVHalService would remove all onBinderDied callbacks.
838     resetVhalInfoLocked();
839 }
840 
reportWatchdogAliveToVhal()841 void WatchdogProcessService::reportWatchdogAliveToVhal() {
842     if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_ALIVE) > 0) {
843         ALOGW("VHAL doesn't support WATCHDOG_ALIVE. Car watchdog will not update WATCHDOG_ALIVE.");
844         return;
845     }
846     int64_t systemUptime = uptimeMillis();
847     VehiclePropValue propValue{
848             .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE),
849             .value.int64Values = {systemUptime},
850     };
851     const auto& ret = updateVhal(propValue);
852     if (!ret.ok()) {
853         ALOGW("Failed to update WATCHDOG_ALIVE VHAL property. Will try again in 3s, error: %s",
854               ret.error().message().c_str());
855     }
856     // Update VHAL with the interval of TIMEOUT_CRITICAL(3s).
857     auto durationNs = getTimeoutDurationNs(TimeoutLength::TIMEOUT_CRITICAL);
858     mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_WATCHDOG_ALIVE);
859     mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler,
860                                        Message(MSG_VHAL_WATCHDOG_ALIVE));
861 }
862 
reportTerminatedProcessToVhal(const std::vector<ProcessIdentifier> & processesNotResponding)863 void WatchdogProcessService::reportTerminatedProcessToVhal(
864         const std::vector<ProcessIdentifier>& processesNotResponding) {
865     if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_TERMINATED_PROCESS) > 0) {
866         ALOGW("VHAL doesn't support WATCHDOG_TERMINATED_PROCESS. Terminated process is not "
867               "reported to VHAL.");
868         return;
869     }
870     for (auto&& processIdentifier : processesNotResponding) {
871         const auto& retCmdLine = readProcCmdLine(processIdentifier.pid);
872         if (!retCmdLine.ok()) {
873             ALOGW("Failed to get process command line for pid(%d): %s", processIdentifier.pid,
874                   retCmdLine.error().message().c_str());
875             continue;
876         }
877         std::string procCmdLine = retCmdLine.value();
878         VehiclePropValue propValue{
879                 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS),
880                 .value.int32Values = {static_cast<int32_t>(
881                         ProcessTerminationReason::NOT_RESPONDING)},
882                 .value.stringValue = procCmdLine,
883         };
884         const auto& retUpdate = updateVhal(propValue);
885         if (!retUpdate.ok()) {
886             ALOGW("Failed to update WATCHDOG_TERMINATED_PROCESS VHAL property(command line: %s)",
887                   procCmdLine.c_str());
888         }
889     }
890 }
891 
updateVhal(const VehiclePropValue & value)892 Result<void> WatchdogProcessService::updateVhal(const VehiclePropValue& value) {
893     const auto& connectRet = connectToVhal();
894     if (!connectRet.ok()) {
895         std::string errorMsg = "VHAL is not connected: " + connectRet.error().message();
896         ALOGW("%s", errorMsg.c_str());
897         return Error() << errorMsg;
898     }
899     int32_t propId = value.prop;
900     std::shared_ptr<IVhalClient> vhalService;
901     {
902         Mutex::Autolock lock(mMutex);
903         if (mNotSupportedVhalProperties.count(static_cast<VehicleProperty>(propId)) > 0) {
904             std::string errorMsg = StringPrintf("VHAL doesn't support property(id: %d)", propId);
905             ALOGW("%s", errorMsg.c_str());
906             return Error() << errorMsg;
907         }
908         vhalService = mVhalService;
909     }
910 
911     auto halPropValue = vhalService->createHalPropValue(propId);
912     halPropValue->setInt32Values(value.value.int32Values);
913     halPropValue->setInt64Values(value.value.int64Values);
914     halPropValue->setStringValue(value.value.stringValue);
915     if (auto result = vhalService->setValueSync(*halPropValue); !result.ok()) {
916         return Error() << "Failed to set propValue(" << propId
917                        << ") to VHAL, error: " << result.error().message();
918     }
919 
920     return {};
921 }
922 
readProcCmdLine(int32_t pid)923 Result<std::string> WatchdogProcessService::readProcCmdLine(int32_t pid) {
924     std::string cmdLinePath = StringPrintf("/proc/%d/cmdline", pid);
925     std::string procCmdLine;
926     if (ReadFileToString(cmdLinePath, &procCmdLine)) {
927         std::replace(procCmdLine.begin(), procCmdLine.end(), '\0', ' ');
928         procCmdLine = Trim(procCmdLine);
929         return procCmdLine;
930     }
931     return Error() << "Failed to read " << cmdLinePath;
932 }
933 
connectToVhal()934 Result<void> WatchdogProcessService::connectToVhal() {
935     {
936         Mutex::Autolock lock(mMutex);
937         if (mVhalService != nullptr) {
938             return {};
939         }
940         mVhalService = kTryCreateVhalClientFunc();
941         if (mVhalService == nullptr) {
942             return Error() << "Failed to connect to VHAL.";
943         }
944         mVhalService->addOnBinderDiedCallback(mVhalBinderDiedCallback);
945     }
946     queryVhalProperties();
947     subscribeToVhalHeartBeat();
948     ALOGI("Successfully connected to VHAL.");
949     return {};
950 }
951 
queryVhalProperties()952 void WatchdogProcessService::queryVhalProperties() {
953     std::shared_ptr<IVhalClient> vhalService;
954     {
955         Mutex::Autolock lock(mMutex);
956         vhalService = mVhalService;
957     }
958     std::unordered_set<VehicleProperty> notSupportedProperties;
959     std::vector<VehicleProperty> propIds = {VehicleProperty::WATCHDOG_ALIVE,
960                                             VehicleProperty::WATCHDOG_TERMINATED_PROCESS,
961                                             VehicleProperty::VHAL_HEARTBEAT};
962     for (const auto& propId : propIds) {
963         if (auto result = vhalService->getPropConfigs({static_cast<int32_t>(propId)});
964             !result.ok()) {
965             notSupportedProperties.insert(propId);
966         }
967     }
968     {
969         Mutex::Autolock lock(mMutex);
970         mNotSupportedVhalProperties = std::move(notSupportedProperties);
971     }
972 }
973 
subscribeToVhalHeartBeat()974 void WatchdogProcessService::subscribeToVhalHeartBeat() {
975     std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
976     {
977         Mutex::Autolock lock(mMutex);
978         if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) > 0) {
979             ALOGW("VHAL doesn't support VHAL_HEARTBEAT. Checking VHAL health is disabled.");
980             return;
981         }
982 
983         mVhalHeartBeat = {
984                 .eventTime = 0,
985                 .value = 0,
986         };
987         propertySubscriptionClient = mVhalService->getSubscriptionClient(mPropertyChangeListener);
988     }
989     std::vector<SubscribeOptions> options = {
990             {.propId = static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT), .areaIds = {}},
991     };
992     if (auto result = propertySubscriptionClient->subscribe(options); !result.ok()) {
993         ALOGW("Failed to subscribe to VHAL_HEARTBEAT. Checking VHAL health is disabled. '%s'",
994               result.error().message().c_str());
995         return;
996     }
997     std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
998     mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
999                                        Message(MSG_VHAL_HEALTH_CHECK));
1000     // VHAL process identifier is required only when terminating the VHAL process. VHAL process is
1001     // terminated only when the VHAL is unhealthy. However, caching the process identifier as soon
1002     // as connecting to VHAL guarantees the correct PID is cached. Because the VHAL pid is queried
1003     // from the service manager, the caching should be performed outside the class level lock. So,
1004     // handle the caching in the handler thread after successfully subscribing to the VHAL_HEARTBEAT
1005     // property.
1006     mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1007     return;
1008 }
1009 
getWatchdogServiceHelperLocked()1010 const sp<WatchdogServiceHelperInterface> WatchdogProcessService::getWatchdogServiceHelperLocked() {
1011     ClientInfoMap& clients = mClientsByTimeout[kCarWatchdogServiceTimeoutDelay];
1012     for (const auto& [_, clientInfo] : clients) {
1013         if (clientInfo.kType == ClientType::Service) {
1014             return clientInfo.kWatchdogServiceHelper;
1015         }
1016     }
1017     return nullptr;
1018 }
1019 
cacheVhalProcessIdentifier()1020 void WatchdogProcessService::cacheVhalProcessIdentifier() {
1021     // Ensure only one MSG_CACHE_VHAL_PROCESS_IDENTIFIER is present on the looper at any given time.
1022     // Duplicate messages could be posted when the CarService restarts during the caching attempts.
1023     // When duplicate messages are present, the following retry delay won't have any effect.
1024     mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1025     bool isAidlVhal;
1026     sp<WatchdogServiceHelperInterface> serviceHelper;
1027     {
1028         Mutex::Autolock lock(mMutex);
1029         if (mVhalService == nullptr || mVhalProcessIdentifier.has_value()) {
1030             return;
1031         }
1032         isAidlVhal = mVhalService->isAidlVhal();
1033         serviceHelper = getWatchdogServiceHelperLocked();
1034         // WatchdogServiceHelper is available only when the CarWatchdogService
1035         // is connected. So, if the WatchdogServiceHelper is not available,
1036         // postpone requesting the AIDL VHAL process identifier from
1037         // CarWatchdogService until the daemon is connected with the service.
1038         if (isAidlVhal && serviceHelper == nullptr) {
1039             if (DEBUG) {
1040                 ALOGE("Skipping requesting AIDL VHAL pid from CarWatchdogService until the service "
1041                       "is connected");
1042             }
1043             return;
1044         }
1045         if (mTotalVhalPidCachingAttempts >= kMaxVhalPidCachingAttempts) {
1046             ALOGE("Failed to cache VHAL process identifier. Total attempts made to cache: %d",
1047                   mTotalVhalPidCachingAttempts);
1048             return;
1049         }
1050         mTotalVhalPidCachingAttempts++;
1051     }
1052     const auto retryCaching = [&](const std::string& logMessage) {
1053         ALOGW("%s. Retrying caching VHAL pid in %lld ms", logMessage.c_str(),
1054               kVhalPidCachingRetryDelayNs.count() / (1'000'000));
1055         mHandlerLooper->sendMessageDelayed(kVhalPidCachingRetryDelayNs.count(), mMessageHandler,
1056                                            Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1057     };
1058     if (isAidlVhal) {
1059         if (const auto status = serviceHelper->requestAidlVhalPid(); !status.isOk()) {
1060             retryCaching(StringPrintf("Failed to request AIDL VHAL pid from CarWatchdogService: %s",
1061                                       status.getMessage()));
1062             return;
1063         }
1064         // CarWatchdogService responds with the PID via an asynchronous callback. When
1065         // CarWatchdogService cannot respond with the PID, the daemon must retry caching the PID but
1066         // this needs to happen asynchronously. So, post a retry message to ensure that the AIDL
1067         // VHAL PID is returned by the CarWatchdogService within the retry timeout.
1068         retryCaching("Requested AIDL VHAL pid from CarWatchdogService");
1069         return;
1070     }
1071     Result<pid_t> result;
1072     sp<IServiceManager> hidlServiceManager = kTryGetHidlServiceManagerFunc();
1073     if (hidlServiceManager == nullptr) {
1074         retryCaching("Failed to get HIDL service manager");
1075         return;
1076     }
1077     if (result = queryHidlServiceManagerForVhalPid(hidlServiceManager); !result.ok()) {
1078         retryCaching(result.error().message());
1079         return;
1080     }
1081     cacheVhalProcessIdentifierForPid(*result);
1082 }
1083 
onAidlVhalPidFetched(pid_t pid)1084 void WatchdogProcessService::onAidlVhalPidFetched(pid_t pid) {
1085     {
1086         Mutex::Autolock lock(mMutex);
1087         if (mVhalService == nullptr || !mVhalService->isAidlVhal()) {
1088             return;
1089         }
1090     }
1091     cacheVhalProcessIdentifierForPid(pid);
1092 }
1093 
cacheVhalProcessIdentifierForPid(int32_t pid)1094 void WatchdogProcessService::cacheVhalProcessIdentifierForPid(int32_t pid) {
1095     if (pid < 0) {
1096         ALOGE("Ignoring request to cache invalid VHAL pid (%d)", pid);
1097         return;
1098     }
1099     ProcessIdentifier processIdentifier;
1100     processIdentifier.pid = pid;
1101     processIdentifier.startTimeMillis = kGetStartTimeForPidFunc(pid);
1102 
1103     Mutex::Autolock lock(mMutex);
1104     mVhalProcessIdentifier = processIdentifier;
1105     mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1106 }
1107 
getNewSessionId()1108 int32_t WatchdogProcessService::getNewSessionId() {
1109     // Make sure that session id is always positive number.
1110     if (++mLastSessionId <= 0) {
1111         mLastSessionId = 1;
1112     }
1113     return mLastSessionId;
1114 }
1115 
updateVhalHeartBeat(int64_t value)1116 void WatchdogProcessService::updateVhalHeartBeat(int64_t value) {
1117     bool wrongHeartBeat;
1118     {
1119         Mutex::Autolock lock(mMutex);
1120         if (!mIsEnabled) {
1121             return;
1122         }
1123         wrongHeartBeat = value <= mVhalHeartBeat.value;
1124         mVhalHeartBeat.eventTime = uptimeMillis();
1125         mVhalHeartBeat.value = value;
1126     }
1127     if (wrongHeartBeat) {
1128         ALOGW("VHAL updated heart beat with a wrong value. Terminating VHAL...");
1129         terminateVhal();
1130         return;
1131     }
1132     std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
1133     mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
1134                                        Message(MSG_VHAL_HEALTH_CHECK));
1135 }
1136 
checkVhalHealth()1137 void WatchdogProcessService::checkVhalHealth() {
1138     int64_t lastEventTime;
1139     int64_t currentUptime = uptimeMillis();
1140     {
1141         Mutex::Autolock lock(mMutex);
1142         if (mVhalService == nullptr || !mIsEnabled) {
1143             return;
1144         }
1145         lastEventTime = mVhalHeartBeat.eventTime;
1146     }
1147     if (currentUptime > lastEventTime + mVhalHealthCheckWindowMs.count()) {
1148         ALOGW("VHAL failed to update heart beat within timeout. Terminating VHAL...");
1149         terminateVhal();
1150     }
1151 }
1152 
resetVhalInfoLocked()1153 void WatchdogProcessService::resetVhalInfoLocked() {
1154     mVhalService.reset();
1155     mVhalProcessIdentifier.reset();
1156     mTotalVhalPidCachingAttempts = 0;
1157     // Stop any pending caching attempts when the VHAL info is reset.
1158     mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1159 }
1160 
terminateVhal()1161 void WatchdogProcessService::terminateVhal() {
1162     std::optional<ProcessIdentifier> processIdentifier;
1163     {
1164         Mutex::Autolock lock(mMutex);
1165         processIdentifier = mVhalProcessIdentifier;
1166         resetVhalInfoLocked();
1167         if (!processIdentifier.has_value()) {
1168             ALOGE("Failed to terminate VHAL: failed to fetch VHAL PID");
1169             return;
1170         }
1171     }
1172     dumpAndKillAllProcesses(std::vector<ProcessIdentifier>(1, *processIdentifier),
1173                             /*reportToVhal=*/false);
1174 }
1175 
getTimeoutDurationNs(const TimeoutLength & timeout)1176 std::chrono::nanoseconds WatchdogProcessService::getTimeoutDurationNs(
1177         const TimeoutLength& timeout) {
1178     // When a default timeout has been overridden by the |kPropertyClientCheckInterval| read-only
1179     // property override the timeout value for all timeout lengths.
1180     if (mOverriddenClientHealthCheckWindowNs.has_value()) {
1181         return mOverriddenClientHealthCheckWindowNs.value();
1182     }
1183     switch (timeout) {
1184         case TimeoutLength::TIMEOUT_CRITICAL:
1185             return 3s;  // 3s and no buffer time.
1186         case TimeoutLength::TIMEOUT_MODERATE:
1187             return 6s;  // 5s + 1s as buffer time.
1188         case TimeoutLength::TIMEOUT_NORMAL:
1189             return 12s;  // 10s + 2s as buffer time.
1190     }
1191 }
1192 
toString() const1193 std::string WatchdogProcessService::ClientInfo::toString() const {
1194     std::string buffer;
1195     StringAppendF(&buffer, "pid = %d, userId = %d, type = %s", kPid, kUserId,
1196                   kType == ClientType::Regular ? "regular" : "watchdog service");
1197     return buffer;
1198 }
1199 
getAIBinder() const1200 AIBinder* WatchdogProcessService::ClientInfo::getAIBinder() const {
1201     if (kType == ClientType::Regular) {
1202         return kClient->asBinder().get();
1203     }
1204     return kWatchdogServiceBinder.get();
1205 }
1206 
linkToDeath(AIBinder_DeathRecipient * recipient) const1207 ScopedAStatus WatchdogProcessService::ClientInfo::linkToDeath(
1208         AIBinder_DeathRecipient* recipient) const {
1209     if (kType == ClientType::Regular) {
1210         AIBinder* aiBinder = getAIBinder();
1211         return kService.mDeathRegistrationWrapper->linkToDeath(aiBinder, recipient,
1212                                                                static_cast<void*>(aiBinder));
1213     }
1214     // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1215     // skip this step.
1216     return ScopedAStatus::ok();
1217 }
1218 
unlinkToDeath(AIBinder_DeathRecipient * recipient) const1219 ScopedAStatus WatchdogProcessService::ClientInfo::unlinkToDeath(
1220         AIBinder_DeathRecipient* recipient) const {
1221     if (kType == ClientType::Regular) {
1222         AIBinder* aiBinder = getAIBinder();
1223         return kService.mDeathRegistrationWrapper->unlinkToDeath(aiBinder, recipient,
1224                                                                  static_cast<void*>(aiBinder));
1225     }
1226     // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1227     // skip this step.
1228     return ScopedAStatus::ok();
1229 }
1230 
checkIfAlive(TimeoutLength timeout) const1231 ScopedAStatus WatchdogProcessService::ClientInfo::checkIfAlive(TimeoutLength timeout) const {
1232     if (kType == ClientType::Regular) {
1233         return kClient->checkIfAlive(sessionId, timeout);
1234     }
1235     return kWatchdogServiceHelper->checkIfAlive(kWatchdogServiceBinder, sessionId, timeout);
1236 }
1237 
prepareProcessTermination() const1238 ScopedAStatus WatchdogProcessService::ClientInfo::prepareProcessTermination() const {
1239     if (kType == ClientType::Regular) {
1240         return kClient->prepareProcessTermination();
1241     }
1242     return kWatchdogServiceHelper->prepareProcessTermination(kWatchdogServiceBinder);
1243 }
1244 
onPropertyEvent(const std::vector<std::unique_ptr<IHalPropValue>> & propValues)1245 void WatchdogProcessService::PropertyChangeListener::onPropertyEvent(
1246         const std::vector<std::unique_ptr<IHalPropValue>>& propValues) {
1247     for (const auto& value : propValues) {
1248         if (value->getPropId() == static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)) {
1249             if (value->getInt64Values().size() < 1) {
1250                 ALOGE("Invalid VHAL_HEARTBEAT value, empty value");
1251             } else {
1252                 kService->updateVhalHeartBeat(value->getInt64Values()[0]);
1253             }
1254             break;
1255         }
1256     }
1257 }
1258 
onPropertySetError(const std::vector<HalPropError> & errors)1259 void WatchdogProcessService::PropertyChangeListener::onPropertySetError(
1260         const std::vector<HalPropError>& errors) {
1261     for (const auto& error : errors) {
1262         if (error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE) &&
1263             error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS)) {
1264             continue;
1265         }
1266         ALOGE("failed to set VHAL property, prop ID: %d, status: %d", error.propId,
1267               static_cast<int32_t>(error.status));
1268     }
1269 }
1270 
handleMessage(const Message & message)1271 void WatchdogProcessService::MessageHandlerImpl::handleMessage(const Message& message) {
1272     switch (message.what) {
1273         case static_cast<int>(TimeoutLength::TIMEOUT_CRITICAL):
1274         case static_cast<int>(TimeoutLength::TIMEOUT_MODERATE):
1275         case static_cast<int>(TimeoutLength::TIMEOUT_NORMAL):
1276             kService->doHealthCheck(message.what);
1277             break;
1278         case MSG_VHAL_WATCHDOG_ALIVE:
1279             kService->reportWatchdogAliveToVhal();
1280             break;
1281         case MSG_VHAL_HEALTH_CHECK:
1282             kService->checkVhalHealth();
1283             break;
1284         case MSG_CACHE_VHAL_PROCESS_IDENTIFIER:
1285             kService->cacheVhalProcessIdentifier();
1286             break;
1287         default:
1288             ALOGW("Unknown message: %d", message.what);
1289     }
1290 }
1291 
1292 }  // namespace watchdog
1293 }  // namespace automotive
1294 }  // namespace android
1295