/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.car.watchdog;

import static android.car.watchdog.CarWatchdogManager.TIMEOUT_CRITICAL;
import static android.car.watchdog.CarWatchdogManager.TIMEOUT_MODERATE;
import static android.car.watchdog.CarWatchdogManager.TIMEOUT_NORMAL;

import static com.android.car.CarServiceUtils.getHandlerThread;
import static com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport.DUMP_INFO;

import android.annotation.NonNull;
import android.annotation.UserIdInt;
import android.automotive.watchdog.internal.ICarWatchdogServiceForSystem;
import android.automotive.watchdog.internal.ProcessIdentifier;
import android.car.builtin.util.Slogf;
import android.car.watchdog.ICarWatchdogServiceCallback;
import android.car.watchdoglib.CarWatchdogDaemonHelper;
import android.os.Binder;
import android.os.Handler;
import android.os.IBinder;
import android.os.Looper;
import android.os.RemoteException;
import android.os.SystemClock;
import android.os.SystemProperties;
import android.util.SparseArray;
import android.util.SparseBooleanArray;

import com.android.car.CarServiceHelperWrapper;
import com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport;
import com.android.car.internal.util.IndentingPrintWriter;
import com.android.internal.annotations.GuardedBy;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
 * Handles clients' health status checking and reporting the statuses to the watchdog daemon.
 *
 * <p>Clients register a callback together with a timeout class (critical, moderate or normal).
 * On every health check request the handler pings the registered clients, collects the clients
 * that did not answer within the duration of their timeout class and reports them to the car
 * watchdog daemon. All mutable state is guarded by {@code mLock}.
 */
public final class WatchdogProcessHandler {
    static final String PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL =
            "ro.carwatchdog.client_healthcheck.interval";
    static final int MISSING_INT_PROPERTY_VALUE = -1;

    private static final int[] ALL_TIMEOUTS =
            { TIMEOUT_CRITICAL, TIMEOUT_MODERATE, TIMEOUT_NORMAL };

    private final ICarWatchdogServiceForSystem mWatchdogServiceForSystem;
    private final CarWatchdogDaemonHelper mCarWatchdogDaemonHelper;
    private final Handler mMainHandler = new Handler(Looper.getMainLooper());
    private final Handler mServiceHandler = new Handler(getHandlerThread(
            CarWatchdogService.class.getSimpleName()).getLooper());
    private final Object mLock = new Object();
    /*
     * Keeps the list of car watchdog client according to timeout:
     * key => timeout, value => ClientInfo list.
     * The value of SparseArray is guarded by mLock.
     */
    @GuardedBy("mLock")
    private final SparseArray<ArrayList<ClientInfo>> mClientMap = new SparseArray<>();
    /*
     * Keeps the map of car watchdog client being checked by CarWatchdogService according to
     * timeout: key => timeout, value => ClientInfo map.
     * The value is also a map: key => session id, value => ClientInfo.
     */
    @GuardedBy("mLock")
    private final SparseArray<SparseArray<ClientInfo>> mPingedClientMap = new SparseArray<>();
    /*
     * Keeps whether client health checking is being performed according to timeout:
     * key => timeout, value => boolean (whether client health checking is being performed).
     * The value of SparseArray is guarded by mLock.
     */
    @GuardedBy("mLock")
    private final SparseArray<Boolean> mClientCheckInProgress = new SparseArray<>();
    /*
     * Clients that missed their last ping and have not yet been reported to the car watchdog
     * daemon. The list is drained by reportHealthCheckResult.
     */
    @GuardedBy("mLock")
    private final ArrayList<ClientInfo> mClientsNotResponding = new ArrayList<>();
    // Last session id handed out by getNewSessionId. Always positive once the first id is issued.
    @GuardedBy("mLock")
    private int mLastSessionId;
    // User ids that are currently stopped. Clients of these users are exempt from health checks.
    @GuardedBy("mLock")
    private final SparseBooleanArray mStoppedUser = new SparseBooleanArray();

    // Health check window override read in init(); MISSING_INT_PROPERTY_VALUE when not overridden.
    private long mOverriddenClientHealthCheckWindowMs = MISSING_INT_PROPERTY_VALUE;

    /**
     * Creates the handler.
     *
     * @param serviceImpl  service binder passed back to the daemon when reporting liveness
     * @param daemonHelper helper used for every call into the car watchdog daemon
     */
    public WatchdogProcessHandler(ICarWatchdogServiceForSystem serviceImpl,
            CarWatchdogDaemonHelper daemonHelper) {
        mWatchdogServiceForSystem = serviceImpl;
        mCarWatchdogDaemonHelper = daemonHelper;
    }

    /**
     * Initializes the handler.
     *
     * <p>Creates the per-timeout bookkeeping entries and reads the optional health check window
     * override from {@link #PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL} (in seconds).
     */
    public void init() {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                mClientMap.put(timeout, new ArrayList<ClientInfo>());
                mPingedClientMap.put(timeout, new SparseArray<ClientInfo>());
                mClientCheckInProgress.put(timeout, false);
            }
        }
        // Overridden timeout value must be greater than or equal to the maximum possible timeout
        // value. Otherwise, clients will be pinged more frequently than the guaranteed timeout
        // duration.
        int clientHealthCheckWindowSec = SystemProperties.getInt(
                PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL, MISSING_INT_PROPERTY_VALUE);
        if (clientHealthCheckWindowSec != MISSING_INT_PROPERTY_VALUE) {
            mOverriddenClientHealthCheckWindowMs = Math.max(clientHealthCheckWindowSec * 1000L,
                    getTimeoutDurationMs(TIMEOUT_NORMAL));
        }
        if (CarWatchdogService.DEBUG) {
            Slogf.d(CarWatchdogService.TAG, "WatchdogProcessHandler is initialized");
        }
    }

    /** Dumps the registered clients and the stopped users to {@code writer}. */
    @ExcludeFromCodeCoverageGeneratedReport(reason = DUMP_INFO)
    public void dump(IndentingPrintWriter writer) {
        synchronized (mLock) {
            writer.println("Registered clients");
            writer.increaseIndent();
            int count = 1;
            for (int timeout : ALL_TIMEOUTS) {
                ArrayList<ClientInfo> clients = mClientMap.get(timeout);
                String timeoutStr = timeoutToString(timeout);
                for (ClientInfo clientInfo : clients) {
                    writer.printf("client #%d: timeout = %s, pid = %d\n", count++, timeoutStr,
                            clientInfo.pid);
                }
            }
            writer.printf("Stopped users: ");
            int size = mStoppedUser.size();
            if (size > 0) {
                writer.printf("%d", mStoppedUser.keyAt(0));
                for (int i = 1; i < size; i++) {
                    writer.printf(", %d", mStoppedUser.keyAt(i));
                }
                writer.println();
            } else {
                writer.println("none");
            }
            writer.decreaseIndent();
        }
    }

    /**
     * Registers the client callback.
     *
     * <p>The request is ignored (with a warning) when {@code timeout} has no client list, when
     * the same binder is already registered for that timeout, or when linking to the client's
     * death fails. The calling pid and user id are recorded for the client.
     *
     * @param client  callback to ping during health checks
     * @param timeout timeout class the client is checked under
     */
    public void registerClient(ICarWatchdogServiceCallback client, int timeout) {
        synchronized (mLock) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            if (clients == null) {
                Slogf.w(CarWatchdogService.TAG, "Cannot register the client: invalid timeout");
                return;
            }
            IBinder binder = client.asBinder();
            for (int i = 0; i < clients.size(); i++) {
                ClientInfo clientInfo = clients.get(i);
                if (binder == clientInfo.client.asBinder()) {
                    Slogf.w(CarWatchdogService.TAG,
                            "Cannot register the client: the client(pid: %d) has been already "
                            + "registered", clientInfo.pid);
                    return;
                }
            }
            int pid = Binder.getCallingPid();
            int userId = Binder.getCallingUserHandle().getIdentifier();
            ClientInfo clientInfo = new ClientInfo(client, pid, userId, timeout);
            try {
                clientInfo.linkToDeath();
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Cannot register the client: linkToDeath to the client failed");
                return;
            }
            clients.add(clientInfo);
            if (CarWatchdogService.DEBUG) {
                Slogf.d(CarWatchdogService.TAG, "Registered client: %s", clientInfo);
            }
        }
    }

    /**
     * Unregisters the previously registered client callback.
     *
     * <p>The client is removed from the registered clients, from the clients awaiting a ping
     * response and from the clients pending to be reported as not responding. A warning is
     * logged when the client is found in none of them.
     *
     * @param client callback that was passed to {@link #registerClient}
     */
    public void unregisterClient(ICarWatchdogServiceCallback client) {
        ClientInfo clientInfo;
        synchronized (mLock) {
            IBinder binder = client.asBinder();
            clientInfo = removeFromClientMapsLocked(binder).orElse(null);
            // Even if a client did not respond to the latest ping, CarWatchdogService should honor
            // the unregister request at this point and remove it from all internal caches.
            // Otherwise, the client might be killed even after unregistering. Such a client has
            // already been dropped from mClientMap by analyzeClientResponse, so this list must be
            // scanned regardless of whether the client was found above.
            for (int i = mClientsNotResponding.size() - 1; i >= 0; i--) {
                ClientInfo notRespondingClientInfo = mClientsNotResponding.get(i);
                if (binder != notRespondingClientInfo.client.asBinder()) {
                    continue;
                }
                mClientsNotResponding.remove(i);
                if (notRespondingClientInfo == clientInfo) {
                    // Death link was already released by removeFromClientMapsLocked.
                    continue;
                }
                // analyzeClientResponse does not unlink the death recipient, so release it here.
                notRespondingClientInfo.unlinkToDeath();
                if (clientInfo == null) {
                    clientInfo = notRespondingClientInfo;
                }
            }
            if (clientInfo == null) {
                Slogf.w(CarWatchdogService.TAG,
                        "Cannot unregister the client: the client has not been registered before");
                return;
            }
        }
        if (CarWatchdogService.DEBUG) {
            Slogf.d(CarWatchdogService.TAG, "Unregistered client: %s", clientInfo);
        }
    }

    /**
     * Removes the first registered client matching {@code binder} from the registered client map
     * and from the pinged client map, and unlinks its death recipient.
     *
     * @return the removed client, or an empty {@link Optional} when no registered client matches
     */
    @GuardedBy("mLock")
    private Optional<ClientInfo> removeFromClientMapsLocked(IBinder binder) {
        for (int timeout : ALL_TIMEOUTS) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            for (int i = 0; i < clients.size(); i++) {
                ClientInfo clientInfo = clients.get(i);
                if (binder != clientInfo.client.asBinder()) {
                    continue;
                }
                clientInfo.unlinkToDeath();
                clients.remove(i);
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                if (pingedClients != null) {
                    pingedClients.remove(clientInfo.sessionId);
                }
                return Optional.of(clientInfo);
            }
        }
        return Optional.empty();
    }

    /**
     * Tells the handler that the client is alive.
     *
     * <p>The response is accepted only while a check is in progress for the client's timeout and
     * only when {@code sessionId} was issued to this very client.
     *
     * @param client    callback that received the ping
     * @param sessionId session id received with the ping
     */
    public void tellClientAlive(ICarWatchdogServiceCallback client, int sessionId) {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                if (!mClientCheckInProgress.get(timeout)) {
                    continue;
                }
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                ClientInfo clientInfo = pingedClients.get(sessionId);
                if (clientInfo != null && clientInfo.client.asBinder() == client.asBinder()) {
                    pingedClients.remove(sessionId);
                    return;
                }
            }
        }
    }

    /**
     * Updates the user stopped state.
     *
     * @param userId    user whose state changed
     * @param isStopped {@code true} to exempt the user's clients from health checks
     */
    public void updateUserState(@UserIdInt int userId, boolean isStopped) {
        synchronized (mLock) {
            if (isStopped) {
                mStoppedUser.put(userId, true);
            } else {
                mStoppedUser.delete(userId);
            }
        }
    }

    /**
     * Posts health check message.
     *
     * <p>The check is placed at the front of the main handler's queue.
     *
     * @param sessionId session id to report back to the car watchdog daemon
     */
    public void postHealthCheckMessage(int sessionId) {
        mMainHandler.postAtFrontOfQueue(() -> doHealthCheck(sessionId));
    }

    /**
     * Returns the number of clients currently registered under {@code timeout}, or 0 when the
     * timeout has no client list.
     */
    public int getClientCount(int timeout) {
        synchronized (mLock) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            return clients != null ? clients.size() : 0;
        }
    }

    /** Resets pinged clients before health checking */
    public void prepareHealthCheck() {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                pingedClients.clear();
            }
        }
    }

    /**
     * Asynchronously fetches the AIDL VHAL pid from SystemServer.
     *
     * On fetching the AIDL VHAL pid, car watchdog daemon is updated via an async callback.
     */
    public void asyncFetchAidlVhalPid() {
        mServiceHandler.post(() -> {
            int pid = CarServiceHelperWrapper.getInstance().fetchAidlVhalPid();
            if (pid < 0) {
                Slogf.e(CarWatchdogService.TAG, "Failed to fetch AIDL VHAL pid from"
                        + " CarServiceHelperService");
                return;
            }
            try {
                mCarWatchdogDaemonHelper.onAidlVhalPidFetched(pid);
            } catch (RemoteException e) {
                Slogf.e(CarWatchdogService.TAG,
                        "Failed to notify car watchdog daemon of the AIDL VHAL pid");
            }
        });
    }

    /** Enables/disables the watchdog daemon client health check process. */
    void controlProcessHealthCheck(boolean enable) {
        try {
            mCarWatchdogDaemonHelper.controlProcessHealthCheck(enable);
        } catch (RemoteException e) {
            Slogf.w(CarWatchdogService.TAG,
                    "Cannot enable/disable the car watchdog daemon health check process: %s", e);
        }
    }

    /**
     * Drops a client whose binder died from the registered clients and from the clients awaiting
     * a ping response, so that a dead client is not reported as not responding afterwards.
     */
    private void onClientDeath(ICarWatchdogServiceCallback client, int timeout) {
        synchronized (mLock) {
            IBinder binder = client.asBinder();
            removeClientLocked(binder, timeout);
            SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
            if (pingedClients == null) {
                return;
            }
            // Iterate backwards so that removals do not disturb the indices still to be visited.
            for (int i = pingedClients.size() - 1; i >= 0; i--) {
                if (binder == pingedClients.valueAt(i).client.asBinder()) {
                    pingedClients.removeAt(i);
                }
            }
        }
    }

    /** Runs one health check round in response to a request carrying {@code sessionId}. */
    private void doHealthCheck(int sessionId) {
        // For critical clients, the response status are checked just before reporting to car
        // watchdog daemon. For moderate and normal clients, the status are checked after allowed
        // delay per timeout.
        analyzeClientResponse(TIMEOUT_CRITICAL);
        reportHealthCheckResult(sessionId);
        sendPingToClients(TIMEOUT_CRITICAL);
        sendPingToClientsAndCheck(TIMEOUT_MODERATE);
        sendPingToClientsAndCheck(TIMEOUT_NORMAL);
    }

    /**
     * Moves every client that is still awaiting a ping response for {@code timeout} to the not
     * responding list (clients of stopped users excepted) and marks the check as finished.
     */
    private void analyzeClientResponse(int timeout) {
        // Clients which are not responding are stored in mClientsNotResponding, and will be dumped
        // and killed at the next response of CarWatchdogService to car watchdog daemon.
        synchronized (mLock) {
            SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
            for (int i = 0; i < pingedClients.size(); i++) {
                ClientInfo clientInfo = pingedClients.valueAt(i);
                if (mStoppedUser.get(clientInfo.userId)) {
                    continue;
                }
                mClientsNotResponding.add(clientInfo);
                removeClientLocked(clientInfo.client.asBinder(), timeout);
            }
            // put() addresses the entry by key; setValueAt() would interpret timeout as an index.
            mClientCheckInProgress.put(timeout, false);
        }
    }

    /**
     * Issues a fresh session id to every registered client of {@code timeout} whose user is not
     * stopped, records the client as awaiting a response and pings it.
     */
    private void sendPingToClients(int timeout) {
        ArrayList<ClientInfo> clientsToCheck = new ArrayList<>();
        synchronized (mLock) {
            SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
            pingedClients.clear();
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            for (int i = 0; i < clients.size(); i++) {
                ClientInfo clientInfo = clients.get(i);
                if (mStoppedUser.get(clientInfo.userId)) {
                    continue;
                }
                int sessionId = getNewSessionId();
                clientInfo.sessionId = sessionId;
                pingedClients.put(sessionId, clientInfo);
                // Only clients that are tracked are pinged: a ping carrying an untracked session
                // id could never be acknowledged through tellClientAlive.
                clientsToCheck.add(clientInfo);
            }
            // put() addresses the entry by key; setValueAt() would interpret timeout as an index.
            mClientCheckInProgress.put(timeout, true);
        }

        // Binder calls are made outside the lock.
        for (int i = 0; i < clientsToCheck.size(); i++) {
            ClientInfo clientInfo = clientsToCheck.get(i);
            try {
                clientInfo.client.onCheckHealthStatus(clientInfo.sessionId, timeout);
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Sending a ping message to client(pid: %d) failed: %s",
                        clientInfo.pid, e);
                synchronized (mLock) {
                    mPingedClientMap.get(timeout).remove(clientInfo.sessionId);
                }
            }
        }
    }

    /**
     * Pings the clients of {@code timeout} unless a check is already in progress, and schedules
     * the response analysis after the timeout's duration.
     */
    private void sendPingToClientsAndCheck(int timeout) {
        synchronized (mLock) {
            if (mClientCheckInProgress.get(timeout)) {
                return;
            }
        }
        sendPingToClients(timeout);
        mMainHandler.postDelayed(
                () -> analyzeClientResponse(timeout), getTimeoutDurationMs(timeout));
    }

    /** Returns the next session id; restarts from 1 when the counter would become non-positive. */
    private int getNewSessionId() {
        synchronized (mLock) {
            if (++mLastSessionId <= 0) {
                mLastSessionId = 1;
            }
            return mLastSessionId;
        }
    }

    /** Removes the first client matching {@code clientBinder} from the list of {@code timeout}. */
    @GuardedBy("mLock")
    private void removeClientLocked(IBinder clientBinder, int timeout) {
        ArrayList<ClientInfo> clients = mClientMap.get(timeout);
        for (int i = 0; i < clients.size(); i++) {
            ClientInfo clientInfo = clients.get(i);
            if (clientBinder == clientInfo.client.asBinder()) {
                clients.remove(i);
                return;
            }
        }
    }

    /**
     * Notifies the not responding clients of the upcoming termination and reports them, together
     * with {@code sessionId}, to the car watchdog daemon. The not responding list is emptied.
     */
    private void reportHealthCheckResult(int sessionId) {
        List<ProcessIdentifier> clientsNotResponding;
        ArrayList<ClientInfo> clientsToNotify;
        synchronized (mLock) {
            clientsNotResponding = toProcessIdentifierList(mClientsNotResponding);
            clientsToNotify = new ArrayList<>(mClientsNotResponding);
            mClientsNotResponding.clear();
        }
        // Binder calls are made outside the lock.
        for (int i = 0; i < clientsToNotify.size(); i++) {
            ClientInfo clientInfo = clientsToNotify.get(i);
            try {
                clientInfo.client.onPrepareProcessTermination();
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Notifying onPrepareProcessTermination to client(pid: %d) failed: %s",
                        clientInfo.pid, e);
            }
        }

        try {
            mCarWatchdogDaemonHelper.tellCarWatchdogServiceAlive(
                    mWatchdogServiceForSystem, clientsNotResponding, sessionId);
        } catch (RemoteException | RuntimeException e) {
            Slogf.w(CarWatchdogService.TAG,
                    "Cannot respond to car watchdog daemon (sessionId=%d): %s", sessionId, e);
        }
    }

    /** Converts the clients to process identifiers carrying each client's pid and start time. */
    @NonNull
    private List<ProcessIdentifier> toProcessIdentifierList(
            @NonNull ArrayList<ClientInfo> clientInfos) {
        List<ProcessIdentifier> processIdentifiers = new ArrayList<>(clientInfos.size());
        for (int i = 0; i < clientInfos.size(); i++) {
            ClientInfo clientInfo = clientInfos.get(i);
            ProcessIdentifier processIdentifier = new ProcessIdentifier();
            processIdentifier.pid = clientInfo.pid;
            processIdentifier.startTimeMillis = clientInfo.startTimeMillis;
            processIdentifiers.add(processIdentifier);
        }
        return processIdentifiers;
    }

    /** Returns a human readable name for {@code timeout}, or "unknown" for unexpected values. */
    private String timeoutToString(int timeout) {
        switch (timeout) {
            case TIMEOUT_CRITICAL:
                return "critical";
            case TIMEOUT_MODERATE:
                return "moderate";
            case TIMEOUT_NORMAL:
                return "normal";
            default:
                Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
                return "unknown";
        }
    }

    /**
     * Returns how long clients of {@code timeout} are given to respond, in milliseconds.
     *
     * <p>When the health check window is overridden the override applies to every timeout.
     * Unexpected timeout values fall back to the duration of the normal timeout.
     */
    private long getTimeoutDurationMs(int timeout) {
        if (mOverriddenClientHealthCheckWindowMs != MISSING_INT_PROPERTY_VALUE) {
            return mOverriddenClientHealthCheckWindowMs;
        }
        switch (timeout) {
            case TIMEOUT_CRITICAL:
                return 3000L;
            case TIMEOUT_MODERATE:
                return 5000L;
            case TIMEOUT_NORMAL:
                return 10000L;
            default:
                Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
                return 10000L;
        }
    }

    /**
     * Bookkeeping for one registered client. Also acts as the death recipient of the client's
     * binder and forwards the death to the enclosing handler.
     */
    private final class ClientInfo implements IBinder.DeathRecipient {
        public final ICarWatchdogServiceCallback client;
        public final int pid;
        public final long startTimeMillis;
        @UserIdInt public final int userId;
        public final int timeout;
        // Session id of the most recent ping sent to this client.
        public volatile int sessionId;

        ClientInfo(ICarWatchdogServiceCallback client, int pid, @UserIdInt int userId,
                int timeout) {
            this.client = client;
            this.pid = pid;
            // CarService doesn't have sepolicy access to read per-pid proc files, so it cannot
            // fetch the pid's actual start time. When a client process registers with
            // the CarService, it is safe to assume the process is still alive. So, populate
            // elapsed real time and the consumer (CarServiceHelperService) of this data should
            // verify that the actual start time is less than the reported start time.
            this.startTimeMillis = SystemClock.elapsedRealtime();
            this.userId = userId;
            this.timeout = timeout;
        }

        @Override
        public void binderDied() {
            Slogf.w(CarWatchdogService.TAG, "Client(pid: %d) died", pid);
            onClientDeath(client, timeout);
        }

        private void linkToDeath() throws RemoteException {
            client.asBinder().linkToDeath(this, 0);
        }

        private void unlinkToDeath() {
            client.asBinder().unlinkToDeath(this, 0);
        }

        @Override
        public String toString() {
            return "ClientInfo{client=" + client + ", pid=" + pid + ", startTimeMillis="
                    + startTimeMillis + ", userId=" + userId + ", timeout=" + timeout
                    + ", sessionId=" + sessionId + '}';
        }
    }
}