1 /**
2 * Copyright (c) 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "carwatchdogd"
18 #define DEBUG false // STOPSHIP if true.
19
20 #include "WatchdogProcessService.h"
21
22 #include "ServiceManager.h"
23 #include "UidProcStatsCollector.h"
24 #include "WatchdogServiceHelper.h"
25
26 #include <aidl/android/hardware/automotive/vehicle/BnVehicle.h>
27 #include <aidl/android/hardware/automotive/vehicle/ProcessTerminationReason.h>
28 #include <android-base/file.h>
29 #include <android-base/macros.h>
30 #include <android-base/properties.h>
31 #include <android-base/stringprintf.h>
32 #include <android-base/strings.h>
33 #include <binder/IPCThreadState.h>
34 #include <hidl/HidlTransportSupport.h>
35 #include <utils/SystemClock.h>
36
37 #include <IVhalClient.h>
38 #include <VehicleHalTypes.h>
39 #include <inttypes.h>
40
41 #include <utility>
42
43 namespace android {
44 namespace automotive {
45 namespace watchdog {
46
47 using ::aidl::android::automotive::watchdog::ICarWatchdogClient;
48 using ::aidl::android::automotive::watchdog::TimeoutLength;
49 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogMonitor;
50 using ::aidl::android::automotive::watchdog::internal::ICarWatchdogServiceForSystem;
51 using ::aidl::android::automotive::watchdog::internal::ProcessIdentifier;
52 using ::aidl::android::hardware::automotive::vehicle::BnVehicle;
53 using ::aidl::android::hardware::automotive::vehicle::ProcessTerminationReason;
54 using ::aidl::android::hardware::automotive::vehicle::StatusCode;
55 using ::aidl::android::hardware::automotive::vehicle::SubscribeOptions;
56 using ::aidl::android::hardware::automotive::vehicle::VehiclePropConfig;
57 using ::aidl::android::hardware::automotive::vehicle::VehicleProperty;
58 using ::aidl::android::hardware::automotive::vehicle::VehiclePropertyStatus;
59 using ::aidl::android::hardware::automotive::vehicle::VehiclePropValue;
60 using ::android::sp;
61 using ::android::String16;
62 using ::android::base::Error;
63 using ::android::base::GetIntProperty;
64 using ::android::base::GetProperty;
65 using ::android::base::ReadFileToString;
66 using ::android::base::Result;
67 using ::android::base::StringAppendF;
68 using ::android::base::StringPrintf;
69 using ::android::base::Trim;
70 using ::android::base::WriteStringToFd;
71 using ::android::binder::Status;
72 using ::android::frameworks::automotive::vhal::HalPropError;
73 using ::android::frameworks::automotive::vhal::IHalPropValue;
74 using ::android::frameworks::automotive::vhal::ISubscriptionClient;
75 using ::android::frameworks::automotive::vhal::IVhalClient;
76 using ::android::hardware::hidl_vec;
77 using ::android::hardware::interfacesEqual;
78 using ::android::hardware::Return;
79 using ::android::hidl::base::V1_0::IBase;
80 using ::android::hidl::manager::V1_0::IServiceManager;
81 using ::ndk::ScopedAIBinder_DeathRecipient;
82 using ::ndk::ScopedAStatus;
83 using ::ndk::SpAIBinder;
84
85 namespace {
86
87 const std::vector<TimeoutLength> kTimeouts = {TimeoutLength::TIMEOUT_CRITICAL,
88 TimeoutLength::TIMEOUT_MODERATE,
89 TimeoutLength::TIMEOUT_NORMAL};
90
91 // TimeoutLength is also used as a message ID. Other message IDs should start next to
92 // TimeoutLength::TIMEOUT_NORMAL.
93 const int32_t MSG_VHAL_WATCHDOG_ALIVE = static_cast<int>(TimeoutLength::TIMEOUT_NORMAL) + 1;
94 const int32_t MSG_VHAL_HEALTH_CHECK = MSG_VHAL_WATCHDOG_ALIVE + 1;
95 const int32_t MSG_CACHE_VHAL_PROCESS_IDENTIFIER = MSG_VHAL_HEALTH_CHECK + 1;
96
97 // VHAL is supposed to send heart beat every 3s. Car watchdog checks if there is the latest heart
98 // beat from VHAL within 3s, allowing 1s marginal time.
99 // If {@code ro.carwatchdog.vhal_healthcheck.interval} is set, car watchdog checks VHAL health at
100 // the given interval. The lower bound of the interval is 3s.
101 constexpr int32_t kDefaultVhalCheckIntervalSec = 3;
102 constexpr std::chrono::milliseconds kHealthCheckDelayMs = 1s;
103 constexpr int32_t kMaxVhalPidCachingAttempts = 2;
104 constexpr std::chrono::nanoseconds kDefaultVhalPidCachingRetryDelayNs = 30s;
105 constexpr TimeoutLength kCarWatchdogServiceTimeoutDelay = TimeoutLength::TIMEOUT_CRITICAL;
106 constexpr int32_t kMissingIntPropertyValue = -1;
107
108 constexpr const char kPropertyVhalCheckInterval[] = "ro.carwatchdog.vhal_healthcheck.interval";
109 constexpr const char kPropertyClientCheckInterval[] = "ro.carwatchdog.client_healthcheck.interval";
110 constexpr const char kServiceName[] = "WatchdogProcessService";
111 constexpr const char kHidlVhalInterfaceName[] = "android.hardware.automotive.vehicle@2.0::IVehicle";
112
113 const std::function<sp<IServiceManager>()> kDefaultTryGetHidlServiceManager =
__anon1c1bfe4b0202() 114 []() -> sp<IServiceManager> { return IServiceManager::tryGetService(/*getStub=*/false); };
115
116 enum RegistrationError {
117 ERR_ILLEGAL_STATE = 0,
118 ERR_DUPLICATE_REGISTRATION,
119 };
120
toScopedAStatus(Result<void> resultWithRegistrationError)121 ScopedAStatus toScopedAStatus(Result<void> resultWithRegistrationError) {
122 if (resultWithRegistrationError.ok()) {
123 return ScopedAStatus::ok();
124 }
125 if (resultWithRegistrationError.error().code() ==
126 RegistrationError::ERR_DUPLICATE_REGISTRATION) {
127 return ScopedAStatus::ok();
128 }
129 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
130 resultWithRegistrationError.error()
131 .message()
132 .c_str());
133 }
134
toPidString(const std::vector<ProcessIdentifier> & processIdentifiers)135 std::string toPidString(const std::vector<ProcessIdentifier>& processIdentifiers) {
136 size_t size = processIdentifiers.size();
137 if (size == 0) {
138 return "";
139 }
140 std::string buffer;
141 StringAppendF(&buffer, "%d", processIdentifiers[0].pid);
142 for (size_t i = 1; i < size; i++) {
143 StringAppendF(&buffer, ", %d", processIdentifiers[i].pid);
144 }
145 return buffer;
146 }
147
isSystemShuttingDown()148 bool isSystemShuttingDown() {
149 std::string sysPowerCtl;
150 std::istringstream tokenStream(GetProperty("sys.powerctl", ""));
151 std::getline(tokenStream, sysPowerCtl, ',');
152 return sysPowerCtl == "reboot" || sysPowerCtl == "shutdown";
153 }
154
getStartTimeForPid(pid_t pid)155 int64_t getStartTimeForPid(pid_t pid) {
156 auto pidStat = UidProcStatsCollector::readStatFileForPid(pid);
157 if (!pidStat.ok()) {
158 return elapsedRealtime();
159 }
160 return pidStat->startTimeMillis;
161 }
162
onBinderDied(void * cookie)163 void onBinderDied(void* cookie) {
164 const auto& thiz = ServiceManager::getInstance()->getWatchdogProcessService();
165 if (thiz == nullptr) {
166 return;
167 }
168 thiz->handleBinderDeath(cookie);
169 }
queryHidlServiceManagerForVhalPid(const sp<IServiceManager> & hidlServiceManager)170 Result<pid_t> queryHidlServiceManagerForVhalPid(const sp<IServiceManager>& hidlServiceManager) {
171 pid_t pid = -1;
172 Return<void> ret = hidlServiceManager->debugDump([&](auto& hals) {
173 for (const auto& info : hals) {
174 if (info.pid == static_cast<int>(IServiceManager::PidConstant::NO_PID)) {
175 continue;
176 }
177 if (info.interfaceName == kHidlVhalInterfaceName) {
178 pid = info.pid;
179 return;
180 }
181 }
182 });
183
184 if (!ret.isOk()) {
185 return Error() << "Failed to get VHAL process id from HIDL service manager";
186 }
187 if (pid == -1) {
188 return Error() << "No VHAL service registered to HIDL service manager";
189 }
190 return pid;
191 }
192
193 } // namespace
194
WatchdogProcessService(const sp<Looper> & handlerLooper)195 WatchdogProcessService::WatchdogProcessService(const sp<Looper>& handlerLooper) :
196 WatchdogProcessService(IVhalClient::tryCreate, kDefaultTryGetHidlServiceManager,
197 getStartTimeForPid, kDefaultVhalPidCachingRetryDelayNs, handlerLooper,
198 sp<AIBinderDeathRegistrationWrapper>::make()) {}
199
WatchdogProcessService(const std::function<std::shared_ptr<IVhalClient> ()> & tryCreateVhalClientFunc,const std::function<sp<IServiceManager> ()> & tryGetHidlServiceManagerFunc,const std::function<int64_t (pid_t)> & getStartTimeForPidFunc,const std::chrono::nanoseconds & vhalPidCachingRetryDelayNs,const sp<Looper> & handlerLooper,const sp<AIBinderDeathRegistrationWrapperInterface> & deathRegistrationWrapper)200 WatchdogProcessService::WatchdogProcessService(
201 const std::function<std::shared_ptr<IVhalClient>()>& tryCreateVhalClientFunc,
202 const std::function<sp<IServiceManager>()>& tryGetHidlServiceManagerFunc,
203 const std::function<int64_t(pid_t)>& getStartTimeForPidFunc,
204 const std::chrono::nanoseconds& vhalPidCachingRetryDelayNs, const sp<Looper>& handlerLooper,
205 const sp<AIBinderDeathRegistrationWrapperInterface>& deathRegistrationWrapper) :
206 kTryCreateVhalClientFunc(tryCreateVhalClientFunc),
207 kTryGetHidlServiceManagerFunc(tryGetHidlServiceManagerFunc),
208 kGetStartTimeForPidFunc(getStartTimeForPidFunc),
209 kVhalPidCachingRetryDelayNs(vhalPidCachingRetryDelayNs),
210 mHandlerLooper(handlerLooper),
211 mClientBinderDeathRecipient(
212 ScopedAIBinder_DeathRecipient(AIBinder_DeathRecipient_new(onBinderDied))),
213 mLastSessionId(0),
214 mServiceStarted(false),
215 mDeathRegistrationWrapper(deathRegistrationWrapper),
216 mIsEnabled(true),
217 mVhalService(nullptr),
218 mTotalVhalPidCachingAttempts(0) {
219 mVhalBinderDiedCallback =
220 std::make_shared<IVhalClient::OnBinderDiedCallbackFunc>([this] { handleVhalDeath(); });
221 for (const auto& timeout : kTimeouts) {
222 mClientsByTimeout.insert(std::make_pair(timeout, ClientInfoMap()));
223 mPingedClients.insert(std::make_pair(timeout, PingedClientMap()));
224 }
225
226 int32_t vhalHealthCheckIntervalSec =
227 GetIntProperty(kPropertyVhalCheckInterval, kDefaultVhalCheckIntervalSec);
228 vhalHealthCheckIntervalSec = std::max(vhalHealthCheckIntervalSec, kDefaultVhalCheckIntervalSec);
229 mVhalHealthCheckWindowMs = std::chrono::seconds(vhalHealthCheckIntervalSec);
230
231 int32_t clientHealthCheckIntervalSec =
232 GetIntProperty(kPropertyClientCheckInterval, kMissingIntPropertyValue);
233 // Overridden timeout value must be greater than or equal to the maximum possible timeout value.
234 // Otherwise, clients will be pinged more frequently than the guaranteed timeout duration.
235 if (clientHealthCheckIntervalSec != kMissingIntPropertyValue) {
236 int32_t normalSec = std::chrono::duration_cast<std::chrono::seconds>(
237 getTimeoutDurationNs(TimeoutLength::TIMEOUT_NORMAL))
238 .count();
239 mOverriddenClientHealthCheckWindowNs = std::optional<std::chrono::seconds>{
240 std::max(clientHealthCheckIntervalSec, normalSec)};
241 }
242 }
243
registerClient(const std::shared_ptr<ICarWatchdogClient> & client,TimeoutLength timeout)244 ScopedAStatus WatchdogProcessService::registerClient(
245 const std::shared_ptr<ICarWatchdogClient>& client, TimeoutLength timeout) {
246 if (client == nullptr) {
247 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
248 "Must provide non-null client");
249 }
250 pid_t callingPid = IPCThreadState::self()->getCallingPid();
251 uid_t callingUid = IPCThreadState::self()->getCallingUid();
252
253 ClientInfo clientInfo(client, callingPid, callingUid, kGetStartTimeForPidFunc(callingPid),
254 *this);
255 return toScopedAStatus(registerClient(clientInfo, timeout));
256 }
257
unregisterClient(const std::shared_ptr<ICarWatchdogClient> & client)258 ScopedAStatus WatchdogProcessService::unregisterClient(
259 const std::shared_ptr<ICarWatchdogClient>& client) {
260 if (client == nullptr) {
261 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
262 "Must provide non-null client");
263 }
264 Mutex::Autolock lock(mMutex);
265 return unregisterClientLocked(kTimeouts, client->asBinder(), ClientType::Regular);
266 }
267
registerCarWatchdogService(const SpAIBinder & binder,const sp<WatchdogServiceHelperInterface> & helper)268 ScopedAStatus WatchdogProcessService::registerCarWatchdogService(
269 const SpAIBinder& binder, const sp<WatchdogServiceHelperInterface>& helper) {
270 pid_t callingPid = IPCThreadState::self()->getCallingPid();
271 uid_t callingUid = IPCThreadState::self()->getCallingUid();
272
273 if (helper == nullptr) {
274 return ScopedAStatus::
275 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
276 "Watchdog service helper instance is null");
277 }
278 ClientInfo clientInfo(helper, binder, callingPid, callingUid,
279 kGetStartTimeForPidFunc(callingPid), *this);
280 if (auto result = registerClient(clientInfo, kCarWatchdogServiceTimeoutDelay); !result.ok()) {
281 return toScopedAStatus(result);
282 }
283 Mutex::Autolock lock(mMutex);
284 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0 &&
285 mVhalService != nullptr && mVhalService->isAidlVhal() &&
286 !mVhalProcessIdentifier.has_value()) {
287 // When CarService is restarted in the middle handling the AIDL VHAL pid fetch request,
288 // the request will fail. Restart the caching process only when the AIDL VHAL pid is
289 // missing.
290 mTotalVhalPidCachingAttempts = 0;
291 mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
292 }
293 return ScopedAStatus::ok();
294 }
295
unregisterCarWatchdogService(const SpAIBinder & binder)296 void WatchdogProcessService::unregisterCarWatchdogService(const SpAIBinder& binder) {
297 Mutex::Autolock lock(mMutex);
298
299 std::vector<TimeoutLength> timeouts = {TimeoutLength::TIMEOUT_CRITICAL};
300 unregisterClientLocked(timeouts, binder, ClientType::Service);
301 }
302
registerMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)303 ScopedAStatus WatchdogProcessService::registerMonitor(
304 const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
305 if (monitor == nullptr) {
306 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
307 "Must provide non-null monitor");
308 }
309 const auto binder = monitor->asBinder();
310 {
311 Mutex::Autolock lock(mMutex);
312 if (mMonitor != nullptr) {
313 if (mMonitor->asBinder() == binder) {
314 return ScopedAStatus::ok();
315 }
316 AIBinder* aiBinder = mMonitor->asBinder().get();
317 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
318 static_cast<void*>(aiBinder));
319 }
320 mMonitor = monitor;
321 }
322
323 AIBinder* aiBinder = binder.get();
324 auto status =
325 mDeathRegistrationWrapper->linkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
326 static_cast<void*>(aiBinder));
327 if (!status.isOk()) {
328 {
329 Mutex::Autolock lock(mMutex);
330 if (mMonitor != nullptr && mMonitor->asBinder() == binder) {
331 mMonitor.reset();
332 }
333 }
334 ALOGW("Failed to register the monitor as it is dead.");
335 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_STATE,
336 "The monitor is dead.");
337 }
338 if (DEBUG) {
339 ALOGD("Car watchdog monitor is registered");
340 }
341 return ScopedAStatus::ok();
342 }
343
unregisterMonitor(const std::shared_ptr<ICarWatchdogMonitor> & monitor)344 ScopedAStatus WatchdogProcessService::unregisterMonitor(
345 const std::shared_ptr<ICarWatchdogMonitor>& monitor) {
346 if (monitor == nullptr) {
347 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
348 "Must provide non-null monitor");
349 }
350 const auto binder = monitor->asBinder();
351 Mutex::Autolock lock(mMutex);
352 if (mMonitor == nullptr || mMonitor->asBinder() != binder) {
353 ALOGW("Failed to unregister the monitor as it has not been registered.");
354 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
355 "The monitor has not been registered.");
356 }
357 AIBinder* aiBinder = binder.get();
358 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
359 static_cast<void*>(aiBinder));
360 mMonitor.reset();
361 if (DEBUG) {
362 ALOGD("Car watchdog monitor is unregistered");
363 }
364 return ScopedAStatus::ok();
365 }
366
tellClientAlive(const std::shared_ptr<ICarWatchdogClient> & client,int32_t sessionId)367 ScopedAStatus WatchdogProcessService::tellClientAlive(
368 const std::shared_ptr<ICarWatchdogClient>& client, int32_t sessionId) {
369 if (client == nullptr) {
370 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
371 "Must provide non-null client");
372 }
373 Mutex::Autolock lock(mMutex);
374 return tellClientAliveLocked(client->asBinder(), sessionId);
375 }
376
tellCarWatchdogServiceAlive(const std::shared_ptr<ICarWatchdogServiceForSystem> & service,const std::vector<ProcessIdentifier> & clientsNotResponding,int32_t sessionId)377 ScopedAStatus WatchdogProcessService::tellCarWatchdogServiceAlive(
378 const std::shared_ptr<ICarWatchdogServiceForSystem>& service,
379 const std::vector<ProcessIdentifier>& clientsNotResponding, int32_t sessionId) {
380 if (service == nullptr) {
381 return ScopedAStatus::
382 fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
383 "Must provide non-null car watchdog service");
384 }
385 ScopedAStatus status;
386 {
387 Mutex::Autolock lock(mMutex);
388 if (DEBUG) {
389 if (clientsNotResponding.size() > 0) {
390 ALOGD("CarWatchdogService(session: %d) responded with non-responding clients: %s",
391 sessionId, toPidString(clientsNotResponding).c_str());
392 }
393 }
394 status = tellClientAliveLocked(service->asBinder(), sessionId);
395 }
396 if (status.isOk()) {
397 dumpAndKillAllProcesses(clientsNotResponding, /*reportToVhal=*/true);
398 }
399 return status;
400 }
401
tellDumpFinished(const std::shared_ptr<ICarWatchdogMonitor> & monitor,const ProcessIdentifier & processIdentifier)402 ScopedAStatus WatchdogProcessService::tellDumpFinished(
403 const std::shared_ptr<ICarWatchdogMonitor>& monitor,
404 const ProcessIdentifier& processIdentifier) {
405 Mutex::Autolock lock(mMutex);
406 if (mMonitor == nullptr || monitor == nullptr || mMonitor->asBinder() != monitor->asBinder()) {
407 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
408 "The monitor is not registered or an "
409 "invalid monitor is given");
410 }
411 ALOGI("Process(pid: %d) has been dumped and killed", processIdentifier.pid);
412 return ScopedAStatus::ok();
413 }
414
setEnabled(bool isEnabled)415 void WatchdogProcessService::setEnabled(bool isEnabled) {
416 Mutex::Autolock lock(mMutex);
417 if (mIsEnabled == isEnabled) {
418 return;
419 }
420 ALOGI("%s is %s", kServiceName, isEnabled ? "enabled" : "disabled");
421 mIsEnabled = isEnabled;
422 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
423 if (!mIsEnabled) {
424 return;
425 }
426 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
427 mVhalHeartBeat.eventTime = uptimeMillis();
428 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
429 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
430 Message(MSG_VHAL_HEALTH_CHECK));
431 }
432 for (const auto& timeout : kTimeouts) {
433 mHandlerLooper->removeMessages(mMessageHandler, static_cast<int>(timeout));
434 startHealthCheckingLocked(timeout);
435 }
436 }
437
onUserStateChange(userid_t userId,bool isStarted)438 void WatchdogProcessService::onUserStateChange(userid_t userId, bool isStarted) {
439 std::string buffer;
440 Mutex::Autolock lock(mMutex);
441 if (isStarted) {
442 mStoppedUserIds.erase(userId);
443 } else {
444 mStoppedUserIds.insert(userId);
445 }
446 }
447
// Writes a human-readable snapshot of the service state to the given file descriptor: enabled
// flag, registered clients per timeout, monitor registration, shutdown state, stopped users and
// the VHAL connection / heartbeat / process identifier state.
void WatchdogProcessService::onDump(int fd) {
    Mutex::Autolock lock(mMutex);
    const char* indent = " ";
    const char* doubleIndent = " ";
    const auto boolText = [](bool value) -> const char* { return value ? "true" : "false"; };

    WriteStringToFd("CAR WATCHDOG PROCESS SERVICE\n", fd);
    WriteStringToFd(StringPrintf("%s%s enabled: %s\n", indent, kServiceName, boolText(mIsEnabled)),
                    fd);

    WriteStringToFd(StringPrintf("%sRegistered clients\n", indent), fd);
    // Clients are numbered continuously across all timeout buckets.
    int clientNumber = 1;
    for (const auto& timeout : kTimeouts) {
        for (const auto& [_, clientInfo] : mClientsByTimeout[timeout]) {
            WriteStringToFd(StringPrintf("%sClient #%d: %s\n", doubleIndent, clientNumber,
                                         clientInfo.toString().c_str()),
                            fd);
            ++clientNumber;
        }
    }

    WriteStringToFd(StringPrintf("%sMonitor registered: %s\n", indent,
                                 boolText(mMonitor != nullptr)),
                    fd);
    WriteStringToFd(StringPrintf("%sisSystemShuttingDown: %s\n", indent,
                                 boolText(isSystemShuttingDown())),
                    fd);

    std::string stoppedUsers;
    for (const auto& userId : mStoppedUserIds) {
        if (stoppedUsers.empty()) {
            stoppedUsers = StringPrintf("%d", userId);
        } else {
            StringAppendF(&stoppedUsers, ", %d", userId);
        }
    }
    if (stoppedUsers.empty()) {
        stoppedUsers = "none";
    }
    WriteStringToFd(StringPrintf("%sStopped users: %s\n", indent, stoppedUsers.c_str()), fd);

    if (mVhalService == nullptr) {
        WriteStringToFd(StringPrintf("%sVHAL client is not connected", indent), fd);
        return;
    }
    if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) != 0) {
        WriteStringToFd(StringPrintf("%sVHAL client is connected but the heartbeat property is not "
                                     "supported",
                                     indent),
                        fd);
        return;
    }

    const int64_t nowUptimeMs = uptimeMillis();
    WriteStringToFd(StringPrintf("%sVHAL health check is supported:\n%s\tVHAL health check "
                                 "interval: %lld ms\n%s\tVHAL heartbeat was updated %" PRIi64
                                 " ms ago",
                                 indent, indent, mVhalHealthCheckWindowMs.count(), indent,
                                 nowUptimeMs - mVhalHeartBeat.eventTime),
                    fd);
    const char* vhalType = mVhalService->isAidlVhal() ? "AIDL" : "HIDL";
    if (mVhalProcessIdentifier.has_value()) {
        WriteStringToFd(StringPrintf("%s%s VHAL process identifier (PID = %d, Start time "
                                     "millis = "
                                     "%" PRIi64 ")",
                                     indent, vhalType, mVhalProcessIdentifier->pid,
                                     mVhalProcessIdentifier->startTimeMillis),
                        fd);
        return;
    }
    if (mTotalVhalPidCachingAttempts < kMaxVhalPidCachingAttempts) {
        WriteStringToFd(StringPrintf("%sStill fetching %s VHAL process identifier. "
                                     "Total attempts made = %d, Remaining attempts = %d",
                                     indent, vhalType, mTotalVhalPidCachingAttempts,
                                     kMaxVhalPidCachingAttempts - mTotalVhalPidCachingAttempts),
                        fd);
        return;
    }
    WriteStringToFd(StringPrintf("%sFailed to fetch %s VHAL process identifier. "
                                 "Cannot terminate VHAL when VHAL becomes unresponsive",
                                 indent, vhalType),
                    fd);
}
522
// Runs one health check round for the timeout encoded in the message ID |what|: clients that
// never answered the previous ping are dumped and killed, every remaining client of a
// non-stopped user is pinged with a new session ID, and the next round is scheduled when at
// least one client was selected for pinging.
void WatchdogProcessService::doHealthCheck(int what) {
    mHandlerLooper->removeMessages(mMessageHandler, what);
    {
        Mutex::Autolock lock(mMutex);
        if (!mIsEnabled) {
            return;
        }
    }
    const auto timeout = static_cast<TimeoutLength>(what);
    dumpAndKillClientsIfNotResponding(timeout);

    /* The clients are pinged from a local copy so that the pings go out without holding the lock.
     * As a consequence, a client that unregisters in the meantime may still receive a ping.
     * Clients should be able to handle that.
     */
    std::vector<ClientInfo> pendingPings;
    PingedClientMap* pinged = nullptr;
    {
        Mutex::Autolock lock(mMutex);
        pinged = &mPingedClients[timeout];
        pinged->clear();
        for (auto& [_, clientInfo] : mClientsByTimeout[timeout]) {
            const bool isUserStopped = mStoppedUserIds.count(clientInfo.kUserId) > 0;
            if (isUserStopped) {
                continue;
            }
            const int sessionId = getNewSessionId();
            clientInfo.sessionId = sessionId;
            pendingPings.push_back(clientInfo);
            pinged->insert(std::make_pair(sessionId, clientInfo));
        }
    }

    for (const auto& clientInfo : pendingPings) {
        auto pingStatus = clientInfo.checkIfAlive(timeout);
        if (pingStatus.isOk()) {
            continue;
        }
        if (DEBUG) {
            ALOGW("Failed to send a ping message to client(pid: %d): %s", clientInfo.kPid,
                  pingStatus.getMessage());
        }
        // A client that could not be pinged is not expected to answer.
        Mutex::Autolock lock(mMutex);
        pinged->erase(clientInfo.sessionId);
    }
    // The size of the pinged-client map would be a more specific measure; the local copy is used
    // as a conservative approach.
    if (!pendingPings.empty()) {
        const auto durationNs = getTimeoutDurationNs(timeout);
        mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
    }
}
571
start()572 Result<void> WatchdogProcessService::start() {
573 if (mServiceStarted) {
574 return Error(INVALID_OPERATION) << "Cannot start process monitoring more than once";
575 }
576 auto thiz = sp<WatchdogProcessService>::fromExisting(this);
577 mMessageHandler = sp<MessageHandlerImpl>::make(thiz);
578 mPropertyChangeListener = std::make_shared<PropertyChangeListener>(thiz);
579 mServiceStarted = true;
580 reportWatchdogAliveToVhal();
581 return {};
582 }
583
terminate()584 void WatchdogProcessService::terminate() {
585 std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
586 {
587 Mutex::Autolock lock(mMutex);
588 if (!mServiceStarted) {
589 return;
590 }
591 for (auto& [_, clients] : mClientsByTimeout) {
592 for (auto& [_, client] : clients) {
593 client.unlinkToDeath(mClientBinderDeathRecipient.get());
594 }
595 clients.clear();
596 }
597 mClientsByTimeout.clear();
598 if (mMonitor != nullptr) {
599 AIBinder* aiBinder = mMonitor->asBinder().get();
600 mDeathRegistrationWrapper->unlinkToDeath(aiBinder, mClientBinderDeathRecipient.get(),
601 static_cast<void*>(aiBinder));
602 mMonitor.reset();
603 }
604 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
605 mServiceStarted = false;
606 if (mVhalService == nullptr) {
607 return;
608 }
609 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) == 0) {
610 propertySubscriptionClient =
611 mVhalService->getSubscriptionClient(mPropertyChangeListener);
612 }
613 mVhalService->removeOnBinderDiedCallback(mVhalBinderDiedCallback);
614 resetVhalInfoLocked();
615 }
616 if (propertySubscriptionClient != nullptr) {
617 std::vector<int32_t> propIds = {static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)};
618 auto result = propertySubscriptionClient->unsubscribe(propIds);
619 if (!result.ok()) {
620 ALOGW("Failed to unsubscribe from VHAL_HEARTBEAT.");
621 }
622 }
623 }
624
registerClient(const ClientInfo & clientInfo,TimeoutLength timeout)625 Result<void> WatchdogProcessService::registerClient(const ClientInfo& clientInfo,
626 TimeoutLength timeout) {
627 uintptr_t cookieId = reinterpret_cast<uintptr_t>(clientInfo.getAIBinder());
628 {
629 Mutex::Autolock lock(mMutex);
630 if (findClientAndProcessLocked(kTimeouts, clientInfo.getAIBinder(), nullptr)) {
631 return Error(RegistrationError::ERR_DUPLICATE_REGISTRATION)
632 << "Failed to register (" << clientInfo.toString()
633 << ") as it is already registered";
634 }
635
636 ClientInfoMap& clients = mClientsByTimeout[timeout];
637 clients.insert(std::make_pair(cookieId, clientInfo));
638 }
639 if (auto status = clientInfo.linkToDeath(mClientBinderDeathRecipient.get()); !status.isOk()) {
640 Mutex::Autolock lock(mMutex);
641 if (auto it = mClientsByTimeout.find(timeout); it != mClientsByTimeout.end()) {
642 if (const auto& clientIt = it->second.find(cookieId); clientIt != it->second.end()) {
643 it->second.erase(clientIt);
644 }
645 }
646 return Error(RegistrationError::ERR_ILLEGAL_STATE)
647 << "Failed to register (" << clientInfo.toString() << ") as it is dead";
648 }
649 if (DEBUG) {
650 ALOGD("Car watchdog client (%s, timeout = %d) is registered", clientInfo.toString().c_str(),
651 timeout);
652 }
653 Mutex::Autolock lock(mMutex);
654 // If the client array becomes non-empty, start health checking.
655 if (mClientsByTimeout[timeout].size() == 1) {
656 startHealthCheckingLocked(timeout);
657 ALOGI("Starting health checking for timeout = %d", timeout);
658 }
659 return {};
660 }
661
unregisterClientLocked(const std::vector<TimeoutLength> & timeouts,const SpAIBinder & binder,ClientType clientType)662 ScopedAStatus WatchdogProcessService::unregisterClientLocked(
663 const std::vector<TimeoutLength>& timeouts, const SpAIBinder& binder,
664 ClientType clientType) {
665 const char* clientName = clientType == ClientType::Regular ? "client" : "service";
666 bool result = findClientAndProcessLocked(timeouts, binder.get(),
667 [&](ClientInfoMap& clients,
668 ClientInfoMap::const_iterator it) {
669 it->second.unlinkToDeath(
670 mClientBinderDeathRecipient.get());
671 clients.erase(it);
672 });
673 if (!result) {
674 std::string errorStr =
675 StringPrintf("The car watchdog %s has not been registered", clientName);
676 const char* errorCause = errorStr.c_str();
677 ALOGW("Failed to unregister the car watchdog %s: %s", clientName, errorCause);
678 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT, errorCause);
679 }
680 if (DEBUG) {
681 ALOGD("Car watchdog %s is unregistered", clientName);
682 }
683 return ScopedAStatus::ok();
684 }
685
tellClientAliveLocked(const SpAIBinder & binder,int32_t sessionId)686 ScopedAStatus WatchdogProcessService::tellClientAliveLocked(const SpAIBinder& binder,
687 int32_t sessionId) {
688 for (const auto& timeout : kTimeouts) {
689 PingedClientMap& clients = mPingedClients[timeout];
690 PingedClientMap::const_iterator it = clients.find(sessionId);
691 if (it == clients.cend() || it->second.getAIBinder() != binder.get()) {
692 continue;
693 }
694 clients.erase(it);
695 return ScopedAStatus::ok();
696 }
697 return ScopedAStatus::fromExceptionCodeWithMessage(EX_ILLEGAL_ARGUMENT,
698 "The client is not registered or the "
699 "session ID is not found");
700 }
701
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,AIBinder * aiBinder,const Processor & processor)702 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
703 AIBinder* aiBinder,
704 const Processor& processor) {
705 return findClientAndProcessLocked(timeouts, reinterpret_cast<uintptr_t>(aiBinder), processor);
706 }
707
findClientAndProcessLocked(const std::vector<TimeoutLength> & timeouts,uintptr_t binderPtrId,const Processor & processor)708 bool WatchdogProcessService::findClientAndProcessLocked(const std::vector<TimeoutLength>& timeouts,
709 uintptr_t binderPtrId,
710 const Processor& processor) {
711 for (const auto& timeout : timeouts) {
712 auto clientsByIdIt = mClientsByTimeout.find(timeout);
713 if (clientsByIdIt == mClientsByTimeout.end()) {
714 continue;
715 }
716 auto it = clientsByIdIt->second.find(binderPtrId);
717 if (it == clientsByIdIt->second.end()) {
718 continue;
719 }
720 if (processor != nullptr) {
721 processor(clientsByIdIt->second, it);
722 }
723 return true;
724 }
725
726 return false;
727 }
728
startHealthCheckingLocked(TimeoutLength timeout)729 Result<void> WatchdogProcessService::startHealthCheckingLocked(TimeoutLength timeout) {
730 PingedClientMap& clients = mPingedClients[timeout];
731 clients.clear();
732 int what = static_cast<int>(timeout);
733 auto durationNs = getTimeoutDurationNs(timeout);
734 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler, Message(what));
735 return {};
736 }
737
dumpAndKillClientsIfNotResponding(TimeoutLength timeout)738 Result<void> WatchdogProcessService::dumpAndKillClientsIfNotResponding(TimeoutLength timeout) {
739 std::vector<ProcessIdentifier> processIdentifiers;
740 std::vector<const ClientInfo*> clientsToNotify;
741 {
742 Mutex::Autolock lock(mMutex);
743 PingedClientMap& clients = mPingedClients[timeout];
744 for (PingedClientMap::const_iterator it = clients.cbegin(); it != clients.cend(); it++) {
745 pid_t pid = -1;
746 userid_t userId = -1;
747 uint64_t startTimeMillis = 0;
748 std::vector<TimeoutLength> timeouts = {timeout};
749 findClientAndProcessLocked(timeouts, it->second.getAIBinder(),
750 [&](ClientInfoMap& cachedClients,
751 ClientInfoMap::const_iterator cachedClientsIt) {
752 auto clientInfo = cachedClientsIt->second;
753 pid = clientInfo.kPid;
754 startTimeMillis = clientInfo.kStartTimeMillis;
755 userId = clientInfo.kUserId;
756 clientInfo.unlinkToDeath(
757 mClientBinderDeathRecipient.get());
758 cachedClients.erase(cachedClientsIt);
759 });
760 if (pid != -1 && mStoppedUserIds.count(userId) == 0) {
761 clientsToNotify.emplace_back(&it->second);
762 ProcessIdentifier processIdentifier;
763 processIdentifier.pid = pid;
764 processIdentifier.startTimeMillis = startTimeMillis;
765 processIdentifiers.push_back(processIdentifier);
766 }
767 }
768 }
769 for (const ClientInfo*& clientInfo : clientsToNotify) {
770 clientInfo->prepareProcessTermination();
771 }
772 return dumpAndKillAllProcesses(processIdentifiers, /*reportToVhal=*/true);
773 }
774
dumpAndKillAllProcesses(const std::vector<ProcessIdentifier> & processesNotResponding,bool reportToVhal)775 Result<void> WatchdogProcessService::dumpAndKillAllProcesses(
776 const std::vector<ProcessIdentifier>& processesNotResponding, bool reportToVhal) {
777 size_t size = processesNotResponding.size();
778 if (size == 0) {
779 return {};
780 }
781 std::string pidString = toPidString(processesNotResponding);
782 std::shared_ptr<ICarWatchdogMonitor> monitor;
783 {
784 Mutex::Autolock lock(mMutex);
785 if (mMonitor == nullptr) {
786 std::string errorMsg =
787 StringPrintf("Failed to dump and kill processes(pid = %s): Monitor is not set",
788 pidString.c_str());
789 ALOGW("%s", errorMsg.c_str());
790 return Error() << errorMsg;
791 }
792 monitor = mMonitor;
793 }
794 if (isSystemShuttingDown()) {
795 ALOGI("Skip dumping and killing processes(%s): The system is shutting down",
796 pidString.c_str());
797 return {};
798 }
799 if (reportToVhal) {
800 reportTerminatedProcessToVhal(processesNotResponding);
801 }
802 monitor->onClientsNotResponding(processesNotResponding);
803 if (DEBUG) {
804 ALOGD("Dumping and killing processes is requested: %s", pidString.c_str());
805 }
806 return {};
807 }
808
809 // Handle when car watchdog clients die.
handleBinderDeath(void * cookie)810 void WatchdogProcessService::handleBinderDeath(void* cookie) {
811 uintptr_t cookieId = reinterpret_cast<uintptr_t>(cookie);
812
813 // The same binder death recipient is used for both monitor and client deaths. So, check both
814 // the monitor and all the clients until a match is found.
815 Mutex::Autolock lock(mMutex);
816 if (mMonitor != nullptr) {
817 if (AIBinder* aiBinder = mMonitor->asBinder().get();
818 reinterpret_cast<uintptr_t>(aiBinder) == cookieId) {
819 mMonitor.reset();
820 ALOGW("The monitor has died.");
821 return;
822 }
823 }
824
825 findClientAndProcessLocked(kTimeouts, cookieId,
826 [&](ClientInfoMap& clients, ClientInfoMap::const_iterator it) {
827 ALOGW("Client(pid: %d) died", it->second.kPid);
828 clients.erase(it);
829 });
830 }
831
832 // Handle when VHAL dies.
handleVhalDeath()833 void WatchdogProcessService::handleVhalDeath() {
834 Mutex::Autolock lock(mMutex);
835 ALOGW("VHAL has died.");
836 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_HEALTH_CHECK);
837 // Destroying mVHalService would remove all onBinderDied callbacks.
838 resetVhalInfoLocked();
839 }
840
reportWatchdogAliveToVhal()841 void WatchdogProcessService::reportWatchdogAliveToVhal() {
842 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_ALIVE) > 0) {
843 ALOGW("VHAL doesn't support WATCHDOG_ALIVE. Car watchdog will not update WATCHDOG_ALIVE.");
844 return;
845 }
846 int64_t systemUptime = uptimeMillis();
847 VehiclePropValue propValue{
848 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE),
849 .value.int64Values = {systemUptime},
850 };
851 const auto& ret = updateVhal(propValue);
852 if (!ret.ok()) {
853 ALOGW("Failed to update WATCHDOG_ALIVE VHAL property. Will try again in 3s, error: %s",
854 ret.error().message().c_str());
855 }
856 // Update VHAL with the interval of TIMEOUT_CRITICAL(3s).
857 auto durationNs = getTimeoutDurationNs(TimeoutLength::TIMEOUT_CRITICAL);
858 mHandlerLooper->removeMessages(mMessageHandler, MSG_VHAL_WATCHDOG_ALIVE);
859 mHandlerLooper->sendMessageDelayed(durationNs.count(), mMessageHandler,
860 Message(MSG_VHAL_WATCHDOG_ALIVE));
861 }
862
reportTerminatedProcessToVhal(const std::vector<ProcessIdentifier> & processesNotResponding)863 void WatchdogProcessService::reportTerminatedProcessToVhal(
864 const std::vector<ProcessIdentifier>& processesNotResponding) {
865 if (mNotSupportedVhalProperties.count(VehicleProperty::WATCHDOG_TERMINATED_PROCESS) > 0) {
866 ALOGW("VHAL doesn't support WATCHDOG_TERMINATED_PROCESS. Terminated process is not "
867 "reported to VHAL.");
868 return;
869 }
870 for (auto&& processIdentifier : processesNotResponding) {
871 const auto& retCmdLine = readProcCmdLine(processIdentifier.pid);
872 if (!retCmdLine.ok()) {
873 ALOGW("Failed to get process command line for pid(%d): %s", processIdentifier.pid,
874 retCmdLine.error().message().c_str());
875 continue;
876 }
877 std::string procCmdLine = retCmdLine.value();
878 VehiclePropValue propValue{
879 .prop = static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS),
880 .value.int32Values = {static_cast<int32_t>(
881 ProcessTerminationReason::NOT_RESPONDING)},
882 .value.stringValue = procCmdLine,
883 };
884 const auto& retUpdate = updateVhal(propValue);
885 if (!retUpdate.ok()) {
886 ALOGW("Failed to update WATCHDOG_TERMINATED_PROCESS VHAL property(command line: %s)",
887 procCmdLine.c_str());
888 }
889 }
890 }
891
updateVhal(const VehiclePropValue & value)892 Result<void> WatchdogProcessService::updateVhal(const VehiclePropValue& value) {
893 const auto& connectRet = connectToVhal();
894 if (!connectRet.ok()) {
895 std::string errorMsg = "VHAL is not connected: " + connectRet.error().message();
896 ALOGW("%s", errorMsg.c_str());
897 return Error() << errorMsg;
898 }
899 int32_t propId = value.prop;
900 std::shared_ptr<IVhalClient> vhalService;
901 {
902 Mutex::Autolock lock(mMutex);
903 if (mNotSupportedVhalProperties.count(static_cast<VehicleProperty>(propId)) > 0) {
904 std::string errorMsg = StringPrintf("VHAL doesn't support property(id: %d)", propId);
905 ALOGW("%s", errorMsg.c_str());
906 return Error() << errorMsg;
907 }
908 vhalService = mVhalService;
909 }
910
911 auto halPropValue = vhalService->createHalPropValue(propId);
912 halPropValue->setInt32Values(value.value.int32Values);
913 halPropValue->setInt64Values(value.value.int64Values);
914 halPropValue->setStringValue(value.value.stringValue);
915 if (auto result = vhalService->setValueSync(*halPropValue); !result.ok()) {
916 return Error() << "Failed to set propValue(" << propId
917 << ") to VHAL, error: " << result.error().message();
918 }
919
920 return {};
921 }
922
readProcCmdLine(int32_t pid)923 Result<std::string> WatchdogProcessService::readProcCmdLine(int32_t pid) {
924 std::string cmdLinePath = StringPrintf("/proc/%d/cmdline", pid);
925 std::string procCmdLine;
926 if (ReadFileToString(cmdLinePath, &procCmdLine)) {
927 std::replace(procCmdLine.begin(), procCmdLine.end(), '\0', ' ');
928 procCmdLine = Trim(procCmdLine);
929 return procCmdLine;
930 }
931 return Error() << "Failed to read " << cmdLinePath;
932 }
933
connectToVhal()934 Result<void> WatchdogProcessService::connectToVhal() {
935 {
936 Mutex::Autolock lock(mMutex);
937 if (mVhalService != nullptr) {
938 return {};
939 }
940 mVhalService = kTryCreateVhalClientFunc();
941 if (mVhalService == nullptr) {
942 return Error() << "Failed to connect to VHAL.";
943 }
944 mVhalService->addOnBinderDiedCallback(mVhalBinderDiedCallback);
945 }
946 queryVhalProperties();
947 subscribeToVhalHeartBeat();
948 ALOGI("Successfully connected to VHAL.");
949 return {};
950 }
951
queryVhalProperties()952 void WatchdogProcessService::queryVhalProperties() {
953 std::shared_ptr<IVhalClient> vhalService;
954 {
955 Mutex::Autolock lock(mMutex);
956 vhalService = mVhalService;
957 }
958 std::unordered_set<VehicleProperty> notSupportedProperties;
959 std::vector<VehicleProperty> propIds = {VehicleProperty::WATCHDOG_ALIVE,
960 VehicleProperty::WATCHDOG_TERMINATED_PROCESS,
961 VehicleProperty::VHAL_HEARTBEAT};
962 for (const auto& propId : propIds) {
963 if (auto result = vhalService->getPropConfigs({static_cast<int32_t>(propId)});
964 !result.ok()) {
965 notSupportedProperties.insert(propId);
966 }
967 }
968 {
969 Mutex::Autolock lock(mMutex);
970 mNotSupportedVhalProperties = std::move(notSupportedProperties);
971 }
972 }
973
subscribeToVhalHeartBeat()974 void WatchdogProcessService::subscribeToVhalHeartBeat() {
975 std::unique_ptr<ISubscriptionClient> propertySubscriptionClient;
976 {
977 Mutex::Autolock lock(mMutex);
978 if (mNotSupportedVhalProperties.count(VehicleProperty::VHAL_HEARTBEAT) > 0) {
979 ALOGW("VHAL doesn't support VHAL_HEARTBEAT. Checking VHAL health is disabled.");
980 return;
981 }
982
983 mVhalHeartBeat = {
984 .eventTime = 0,
985 .value = 0,
986 };
987 propertySubscriptionClient = mVhalService->getSubscriptionClient(mPropertyChangeListener);
988 }
989 std::vector<SubscribeOptions> options = {
990 {.propId = static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT), .areaIds = {}},
991 };
992 if (auto result = propertySubscriptionClient->subscribe(options); !result.ok()) {
993 ALOGW("Failed to subscribe to VHAL_HEARTBEAT. Checking VHAL health is disabled. '%s'",
994 result.error().message().c_str());
995 return;
996 }
997 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
998 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
999 Message(MSG_VHAL_HEALTH_CHECK));
1000 // VHAL process identifier is required only when terminating the VHAL process. VHAL process is
1001 // terminated only when the VHAL is unhealthy. However, caching the process identifier as soon
1002 // as connecting to VHAL guarantees the correct PID is cached. Because the VHAL pid is queried
1003 // from the service manager, the caching should be performed outside the class level lock. So,
1004 // handle the caching in the handler thread after successfully subscribing to the VHAL_HEARTBEAT
1005 // property.
1006 mHandlerLooper->sendMessage(mMessageHandler, Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1007 return;
1008 }
1009
getWatchdogServiceHelperLocked()1010 const sp<WatchdogServiceHelperInterface> WatchdogProcessService::getWatchdogServiceHelperLocked() {
1011 ClientInfoMap& clients = mClientsByTimeout[kCarWatchdogServiceTimeoutDelay];
1012 for (const auto& [_, clientInfo] : clients) {
1013 if (clientInfo.kType == ClientType::Service) {
1014 return clientInfo.kWatchdogServiceHelper;
1015 }
1016 }
1017 return nullptr;
1018 }
1019
cacheVhalProcessIdentifier()1020 void WatchdogProcessService::cacheVhalProcessIdentifier() {
1021 // Ensure only one MSG_CACHE_VHAL_PROCESS_IDENTIFIER is present on the looper at any given time.
1022 // Duplicate messages could be posted when the CarService restarts during the caching attempts.
1023 // When duplicate messages are present, the following retry delay won't have any effect.
1024 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1025 bool isAidlVhal;
1026 sp<WatchdogServiceHelperInterface> serviceHelper;
1027 {
1028 Mutex::Autolock lock(mMutex);
1029 if (mVhalService == nullptr || mVhalProcessIdentifier.has_value()) {
1030 return;
1031 }
1032 isAidlVhal = mVhalService->isAidlVhal();
1033 serviceHelper = getWatchdogServiceHelperLocked();
1034 // WatchdogServiceHelper is available only when the CarWatchdogService
1035 // is connected. So, if the WatchdogServiceHelper is not available,
1036 // postpone requesting the AIDL VHAL process identifier from
1037 // CarWatchdogService until the daemon is connected with the service.
1038 if (isAidlVhal && serviceHelper == nullptr) {
1039 if (DEBUG) {
1040 ALOGE("Skipping requesting AIDL VHAL pid from CarWatchdogService until the service "
1041 "is connected");
1042 }
1043 return;
1044 }
1045 if (mTotalVhalPidCachingAttempts >= kMaxVhalPidCachingAttempts) {
1046 ALOGE("Failed to cache VHAL process identifier. Total attempts made to cache: %d",
1047 mTotalVhalPidCachingAttempts);
1048 return;
1049 }
1050 mTotalVhalPidCachingAttempts++;
1051 }
1052 const auto retryCaching = [&](const std::string& logMessage) {
1053 ALOGW("%s. Retrying caching VHAL pid in %lld ms", logMessage.c_str(),
1054 kVhalPidCachingRetryDelayNs.count() / (1'000'000));
1055 mHandlerLooper->sendMessageDelayed(kVhalPidCachingRetryDelayNs.count(), mMessageHandler,
1056 Message(MSG_CACHE_VHAL_PROCESS_IDENTIFIER));
1057 };
1058 if (isAidlVhal) {
1059 if (const auto status = serviceHelper->requestAidlVhalPid(); !status.isOk()) {
1060 retryCaching(StringPrintf("Failed to request AIDL VHAL pid from CarWatchdogService: %s",
1061 status.getMessage()));
1062 return;
1063 }
1064 // CarWatchdogService responds with the PID via an asynchronous callback. When
1065 // CarWatchdogService cannot respond with the PID, the daemon must retry caching the PID but
1066 // this needs to happen asynchronously. So, post a retry message to ensure that the AIDL
1067 // VHAL PID is returned by the CarWatchdogService within the retry timeout.
1068 retryCaching("Requested AIDL VHAL pid from CarWatchdogService");
1069 return;
1070 }
1071 Result<pid_t> result;
1072 sp<IServiceManager> hidlServiceManager = kTryGetHidlServiceManagerFunc();
1073 if (hidlServiceManager == nullptr) {
1074 retryCaching("Failed to get HIDL service manager");
1075 return;
1076 }
1077 if (result = queryHidlServiceManagerForVhalPid(hidlServiceManager); !result.ok()) {
1078 retryCaching(result.error().message());
1079 return;
1080 }
1081 cacheVhalProcessIdentifierForPid(*result);
1082 }
1083
onAidlVhalPidFetched(pid_t pid)1084 void WatchdogProcessService::onAidlVhalPidFetched(pid_t pid) {
1085 {
1086 Mutex::Autolock lock(mMutex);
1087 if (mVhalService == nullptr || !mVhalService->isAidlVhal()) {
1088 return;
1089 }
1090 }
1091 cacheVhalProcessIdentifierForPid(pid);
1092 }
1093
cacheVhalProcessIdentifierForPid(int32_t pid)1094 void WatchdogProcessService::cacheVhalProcessIdentifierForPid(int32_t pid) {
1095 if (pid < 0) {
1096 ALOGE("Ignoring request to cache invalid VHAL pid (%d)", pid);
1097 return;
1098 }
1099 ProcessIdentifier processIdentifier;
1100 processIdentifier.pid = pid;
1101 processIdentifier.startTimeMillis = kGetStartTimeForPidFunc(pid);
1102
1103 Mutex::Autolock lock(mMutex);
1104 mVhalProcessIdentifier = processIdentifier;
1105 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1106 }
1107
getNewSessionId()1108 int32_t WatchdogProcessService::getNewSessionId() {
1109 // Make sure that session id is always positive number.
1110 if (++mLastSessionId <= 0) {
1111 mLastSessionId = 1;
1112 }
1113 return mLastSessionId;
1114 }
1115
updateVhalHeartBeat(int64_t value)1116 void WatchdogProcessService::updateVhalHeartBeat(int64_t value) {
1117 bool wrongHeartBeat;
1118 {
1119 Mutex::Autolock lock(mMutex);
1120 if (!mIsEnabled) {
1121 return;
1122 }
1123 wrongHeartBeat = value <= mVhalHeartBeat.value;
1124 mVhalHeartBeat.eventTime = uptimeMillis();
1125 mVhalHeartBeat.value = value;
1126 }
1127 if (wrongHeartBeat) {
1128 ALOGW("VHAL updated heart beat with a wrong value. Terminating VHAL...");
1129 terminateVhal();
1130 return;
1131 }
1132 std::chrono::nanoseconds intervalNs = mVhalHealthCheckWindowMs + kHealthCheckDelayMs;
1133 mHandlerLooper->sendMessageDelayed(intervalNs.count(), mMessageHandler,
1134 Message(MSG_VHAL_HEALTH_CHECK));
1135 }
1136
checkVhalHealth()1137 void WatchdogProcessService::checkVhalHealth() {
1138 int64_t lastEventTime;
1139 int64_t currentUptime = uptimeMillis();
1140 {
1141 Mutex::Autolock lock(mMutex);
1142 if (mVhalService == nullptr || !mIsEnabled) {
1143 return;
1144 }
1145 lastEventTime = mVhalHeartBeat.eventTime;
1146 }
1147 if (currentUptime > lastEventTime + mVhalHealthCheckWindowMs.count()) {
1148 ALOGW("VHAL failed to update heart beat within timeout. Terminating VHAL...");
1149 terminateVhal();
1150 }
1151 }
1152
resetVhalInfoLocked()1153 void WatchdogProcessService::resetVhalInfoLocked() {
1154 mVhalService.reset();
1155 mVhalProcessIdentifier.reset();
1156 mTotalVhalPidCachingAttempts = 0;
1157 // Stop any pending caching attempts when the VHAL info is reset.
1158 mHandlerLooper->removeMessages(mMessageHandler, MSG_CACHE_VHAL_PROCESS_IDENTIFIER);
1159 }
1160
terminateVhal()1161 void WatchdogProcessService::terminateVhal() {
1162 std::optional<ProcessIdentifier> processIdentifier;
1163 {
1164 Mutex::Autolock lock(mMutex);
1165 processIdentifier = mVhalProcessIdentifier;
1166 resetVhalInfoLocked();
1167 if (!processIdentifier.has_value()) {
1168 ALOGE("Failed to terminate VHAL: failed to fetch VHAL PID");
1169 return;
1170 }
1171 }
1172 dumpAndKillAllProcesses(std::vector<ProcessIdentifier>(1, *processIdentifier),
1173 /*reportToVhal=*/false);
1174 }
1175
getTimeoutDurationNs(const TimeoutLength & timeout)1176 std::chrono::nanoseconds WatchdogProcessService::getTimeoutDurationNs(
1177 const TimeoutLength& timeout) {
1178 // When a default timeout has been overridden by the |kPropertyClientCheckInterval| read-only
1179 // property override the timeout value for all timeout lengths.
1180 if (mOverriddenClientHealthCheckWindowNs.has_value()) {
1181 return mOverriddenClientHealthCheckWindowNs.value();
1182 }
1183 switch (timeout) {
1184 case TimeoutLength::TIMEOUT_CRITICAL:
1185 return 3s; // 3s and no buffer time.
1186 case TimeoutLength::TIMEOUT_MODERATE:
1187 return 6s; // 5s + 1s as buffer time.
1188 case TimeoutLength::TIMEOUT_NORMAL:
1189 return 12s; // 10s + 2s as buffer time.
1190 }
1191 }
1192
toString() const1193 std::string WatchdogProcessService::ClientInfo::toString() const {
1194 std::string buffer;
1195 StringAppendF(&buffer, "pid = %d, userId = %d, type = %s", kPid, kUserId,
1196 kType == ClientType::Regular ? "regular" : "watchdog service");
1197 return buffer;
1198 }
1199
getAIBinder() const1200 AIBinder* WatchdogProcessService::ClientInfo::getAIBinder() const {
1201 if (kType == ClientType::Regular) {
1202 return kClient->asBinder().get();
1203 }
1204 return kWatchdogServiceBinder.get();
1205 }
1206
linkToDeath(AIBinder_DeathRecipient * recipient) const1207 ScopedAStatus WatchdogProcessService::ClientInfo::linkToDeath(
1208 AIBinder_DeathRecipient* recipient) const {
1209 if (kType == ClientType::Regular) {
1210 AIBinder* aiBinder = getAIBinder();
1211 return kService.mDeathRegistrationWrapper->linkToDeath(aiBinder, recipient,
1212 static_cast<void*>(aiBinder));
1213 }
1214 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1215 // skip this step.
1216 return ScopedAStatus::ok();
1217 }
1218
unlinkToDeath(AIBinder_DeathRecipient * recipient) const1219 ScopedAStatus WatchdogProcessService::ClientInfo::unlinkToDeath(
1220 AIBinder_DeathRecipient* recipient) const {
1221 if (kType == ClientType::Regular) {
1222 AIBinder* aiBinder = getAIBinder();
1223 return kService.mDeathRegistrationWrapper->unlinkToDeath(aiBinder, recipient,
1224 static_cast<void*>(aiBinder));
1225 }
1226 // WatchdogServiceHelper is the binder death recipient for watchdog service, ergo
1227 // skip this step.
1228 return ScopedAStatus::ok();
1229 }
1230
checkIfAlive(TimeoutLength timeout) const1231 ScopedAStatus WatchdogProcessService::ClientInfo::checkIfAlive(TimeoutLength timeout) const {
1232 if (kType == ClientType::Regular) {
1233 return kClient->checkIfAlive(sessionId, timeout);
1234 }
1235 return kWatchdogServiceHelper->checkIfAlive(kWatchdogServiceBinder, sessionId, timeout);
1236 }
1237
prepareProcessTermination() const1238 ScopedAStatus WatchdogProcessService::ClientInfo::prepareProcessTermination() const {
1239 if (kType == ClientType::Regular) {
1240 return kClient->prepareProcessTermination();
1241 }
1242 return kWatchdogServiceHelper->prepareProcessTermination(kWatchdogServiceBinder);
1243 }
1244
onPropertyEvent(const std::vector<std::unique_ptr<IHalPropValue>> & propValues)1245 void WatchdogProcessService::PropertyChangeListener::onPropertyEvent(
1246 const std::vector<std::unique_ptr<IHalPropValue>>& propValues) {
1247 for (const auto& value : propValues) {
1248 if (value->getPropId() == static_cast<int32_t>(VehicleProperty::VHAL_HEARTBEAT)) {
1249 if (value->getInt64Values().size() < 1) {
1250 ALOGE("Invalid VHAL_HEARTBEAT value, empty value");
1251 } else {
1252 kService->updateVhalHeartBeat(value->getInt64Values()[0]);
1253 }
1254 break;
1255 }
1256 }
1257 }
1258
onPropertySetError(const std::vector<HalPropError> & errors)1259 void WatchdogProcessService::PropertyChangeListener::onPropertySetError(
1260 const std::vector<HalPropError>& errors) {
1261 for (const auto& error : errors) {
1262 if (error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_ALIVE) &&
1263 error.propId != static_cast<int32_t>(VehicleProperty::WATCHDOG_TERMINATED_PROCESS)) {
1264 continue;
1265 }
1266 ALOGE("failed to set VHAL property, prop ID: %d, status: %d", error.propId,
1267 static_cast<int32_t>(error.status));
1268 }
1269 }
1270
handleMessage(const Message & message)1271 void WatchdogProcessService::MessageHandlerImpl::handleMessage(const Message& message) {
1272 switch (message.what) {
1273 case static_cast<int>(TimeoutLength::TIMEOUT_CRITICAL):
1274 case static_cast<int>(TimeoutLength::TIMEOUT_MODERATE):
1275 case static_cast<int>(TimeoutLength::TIMEOUT_NORMAL):
1276 kService->doHealthCheck(message.what);
1277 break;
1278 case MSG_VHAL_WATCHDOG_ALIVE:
1279 kService->reportWatchdogAliveToVhal();
1280 break;
1281 case MSG_VHAL_HEALTH_CHECK:
1282 kService->checkVhalHealth();
1283 break;
1284 case MSG_CACHE_VHAL_PROCESS_IDENTIFIER:
1285 kService->cacheVhalProcessIdentifier();
1286 break;
1287 default:
1288 ALOGW("Unknown message: %d", message.what);
1289 }
1290 }
1291
1292 } // namespace watchdog
1293 } // namespace automotive
1294 } // namespace android
1295