/*
 * Copyright (C) 2021 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.car.watchdog;

import static android.car.watchdog.CarWatchdogManager.TIMEOUT_CRITICAL;
import static android.car.watchdog.CarWatchdogManager.TIMEOUT_MODERATE;
import static android.car.watchdog.CarWatchdogManager.TIMEOUT_NORMAL;

import static com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport.DUMP_INFO;

import android.annotation.NonNull;
import android.annotation.UserIdInt;
import android.automotive.watchdog.internal.ICarWatchdogServiceForSystem;
import android.automotive.watchdog.internal.ProcessIdentifier;
import android.car.builtin.util.Slogf;
import android.car.watchdog.ICarWatchdogServiceCallback;
import android.car.watchdoglib.CarWatchdogDaemonHelper;
import android.os.Binder;
import android.os.Handler;
import android.os.IBinder;
import android.os.Looper;
import android.os.RemoteException;
import android.os.SystemClock;
import android.os.SystemProperties;
import android.util.SparseArray;
import android.util.SparseBooleanArray;

import com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport;
import com.android.car.internal.util.IndentingPrintWriter;
import com.android.internal.annotations.GuardedBy;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

/**
 * Handles clients' health status checking and reporting the statuses to the watchdog daemon.
 *
 * <p>Clients register with one of the timeouts in {@link #ALL_TIMEOUTS}. On every health check
 * request, registered clients are pinged with a fresh session ID; clients that have not answered
 * via {@link #tellClientAlive} by the time their responses are analyzed are recorded as not
 * responding and reported to the daemon with the next health check result.
 *
 * <p>All mutable collections are guarded by {@code mLock}. {@link #init()} must populate the
 * per-timeout containers before health checking starts.
 */
public final class WatchdogProcessHandler {
    static final String PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL =
            "ro.carwatchdog.client_healthcheck.interval";
    static final int MISSING_INT_PROPERTY_VALUE = -1;

    private static final int[] ALL_TIMEOUTS =
            { TIMEOUT_CRITICAL, TIMEOUT_MODERATE, TIMEOUT_NORMAL };

    private final ICarWatchdogServiceForSystem mWatchdogServiceForSystem;
    private final CarWatchdogDaemonHelper mCarWatchdogDaemonHelper;
    private final Handler mMainHandler = new Handler(Looper.getMainLooper());
    private final Object mLock = new Object();
    /*
     * Keeps the list of car watchdog client according to timeout:
     * key => timeout, value => ClientInfo list.
     * The value of SparseArray is guarded by mLock.
     */
    @GuardedBy("mLock")
    private final SparseArray<ArrayList<ClientInfo>> mClientMap = new SparseArray<>();
    /*
     * Keeps the map of car watchdog client being checked by CarWatchdogService according to
     * timeout: key => timeout, value => ClientInfo map.
     * The value is also a map: key => session id, value => ClientInfo.
     */
    @GuardedBy("mLock")
    private final SparseArray<SparseArray<ClientInfo>> mPingedClientMap = new SparseArray<>();
    /*
     * Keeps whether client health checking is being performed according to timeout:
     * key => timeout, value => boolean (whether client health checking is being performed).
     * A primitive boolean map is used so that a timeout without an entry reads as "not in
     * progress" instead of yielding a null Boolean that fails on unboxing.
     */
    @GuardedBy("mLock")
    private final SparseBooleanArray mClientCheckInProgress = new SparseBooleanArray();
    /* Clients that missed a ping and are waiting to be reported to the daemon. */
    @GuardedBy("mLock")
    private final ArrayList<ClientInfo> mClientsNotResponding = new ArrayList<>();
    /* Last session ID handed out for a client ping; kept positive, see getNewSessionId(). */
    @GuardedBy("mLock")
    private int mLastSessionId;
    /* Users currently marked as stopped: key => user id, value => true. */
    @GuardedBy("mLock")
    private final SparseBooleanArray mStoppedUser = new SparseBooleanArray();

    /* Health check window override in milliseconds, or MISSING_INT_PROPERTY_VALUE when unset. */
    private long mOverriddenClientHealthCheckWindowMs = MISSING_INT_PROPERTY_VALUE;

    /**
     * Creates the handler.
     *
     * @param serviceImpl service binder passed back to the daemon when reporting liveness
     * @param daemonHelper helper used to talk to the car watchdog daemon
     */
    public WatchdogProcessHandler(ICarWatchdogServiceForSystem serviceImpl,
            CarWatchdogDaemonHelper daemonHelper) {
        mWatchdogServiceForSystem = serviceImpl;
        mCarWatchdogDaemonHelper = daemonHelper;
    }

    /** Initializes the handler. */
    public void init() {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                mClientMap.put(timeout, new ArrayList<ClientInfo>());
                mPingedClientMap.put(timeout, new SparseArray<ClientInfo>());
                mClientCheckInProgress.put(timeout, false);
            }
        }
        // Overridden timeout value must be greater than or equal to the maximum possible timeout
        // value. Otherwise, clients will be pinged more frequently than the guaranteed timeout
        // duration.
        int clientHealthCheckWindowSec = SystemProperties.getInt(
                PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL, MISSING_INT_PROPERTY_VALUE);
        if (clientHealthCheckWindowSec != MISSING_INT_PROPERTY_VALUE) {
            mOverriddenClientHealthCheckWindowMs = Math.max(clientHealthCheckWindowSec * 1000L,
                    getTimeoutDurationMs(TIMEOUT_NORMAL));
        }
        if (CarWatchdogService.DEBUG) {
            Slogf.d(CarWatchdogService.TAG, "WatchdogProcessHandler is initialized");
        }
    }

    /** Dumps its state. */
    @ExcludeFromCodeCoverageGeneratedReport(reason = DUMP_INFO)
    public void dump(IndentingPrintWriter writer) {
        synchronized (mLock) {
            writer.println("Registered clients");
            writer.increaseIndent();
            int count = 1;
            for (int timeout : ALL_TIMEOUTS) {
                ArrayList<ClientInfo> clients = mClientMap.get(timeout);
                String timeoutStr = timeoutToString(timeout);
                for (ClientInfo clientInfo : clients) {
                    writer.printf("client #%d: timeout = %s, pid = %d\n", count++, timeoutStr,
                            clientInfo.pid);
                }
            }
            writer.printf("Stopped users: ");
            int size = mStoppedUser.size();
            if (size > 0) {
                writer.printf("%d", mStoppedUser.keyAt(0));
                for (int i = 1; i < size; i++) {
                    writer.printf(", %d", mStoppedUser.keyAt(i));
                }
                writer.println();
            } else {
                writer.println("none");
            }
            writer.decreaseIndent();
        }
    }

    /**
     * Registers the client callback.
     *
     * <p>The request is ignored (with a warning) when the timeout has no client list, when the
     * same binder is already registered for that timeout, or when linking to the client's death
     * fails.
     *
     * @param client callback of the client to register
     * @param timeout one of the {@code TIMEOUT_*} constants the client should be checked with
     */
    public void registerClient(ICarWatchdogServiceCallback client, int timeout) {
        synchronized (mLock) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            if (clients == null) {
                Slogf.w(CarWatchdogService.TAG, "Cannot register the client: invalid timeout");
                return;
            }
            IBinder binder = client.asBinder();
            for (int i = 0; i < clients.size(); i++) {
                ClientInfo clientInfo = clients.get(i);
                if (binder == clientInfo.client.asBinder()) {
                    Slogf.w(CarWatchdogService.TAG,
                            "Cannot register the client: the client(pid: %d) has been already "
                            + "registered", clientInfo.pid);
                    return;
                }
            }
            int pid = Binder.getCallingPid();
            int userId = Binder.getCallingUserHandle().getIdentifier();
            ClientInfo clientInfo = new ClientInfo(client, pid, userId, timeout);
            try {
                clientInfo.linkToDeath();
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Cannot register the client: linkToDeath to the client failed");
                return;
            }
            clients.add(clientInfo);
            if (CarWatchdogService.DEBUG) {
                Slogf.d(CarWatchdogService.TAG, "Registered client: %s", clientInfo);
            }
        }
    }

    /**
     * Unregisters the previously registered client callback.
     *
     * @param client callback that was passed to {@link #registerClient}
     */
    public void unregisterClient(ICarWatchdogServiceCallback client) {
        ClientInfo clientInfo;
        synchronized (mLock) {
            IBinder binder = client.asBinder();
            // Even if a client did not respond to the latest ping, CarWatchdogService should honor
            // the unregister request at this point and remove it from all internal caches.
            // Otherwise, the client might be killed even after unregistering.
            Optional<ClientInfo> optionalClientInfo = removeFromClientMapsLocked(binder);
            if (optionalClientInfo.isEmpty()) {
                Slogf.w(CarWatchdogService.TAG,
                        "Cannot unregister the client: the client has not been registered before");
                return;
            }
            clientInfo = optionalClientInfo.get();
            for (int i = 0; i < mClientsNotResponding.size(); i++) {
                ClientInfo notRespondingClientInfo = mClientsNotResponding.get(i);
                if (binder == notRespondingClientInfo.client.asBinder()) {
                    mClientsNotResponding.remove(i);
                    break;
                }
            }
        }
        if (CarWatchdogService.DEBUG) {
            Slogf.d(CarWatchdogService.TAG, "Unregistered client: %s", clientInfo);
        }
    }

    /**
     * Removes the client owning {@code binder} from the registered and pinged client maps and
     * unlinks its death recipient.
     *
     * @return the removed client, or an empty {@link Optional} when no registered client uses
     *         the binder
     */
    @GuardedBy("mLock")
    private Optional<ClientInfo> removeFromClientMapsLocked(IBinder binder) {
        for (int timeout : ALL_TIMEOUTS) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            if (clients == null) {
                // No list for this timeout (init() has not populated it); nothing to remove.
                continue;
            }
            for (int i = 0; i < clients.size(); i++) {
                ClientInfo clientInfo = clients.get(i);
                if (binder != clientInfo.client.asBinder()) {
                    continue;
                }
                clientInfo.unlinkToDeath();
                clients.remove(i);
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                if (pingedClients != null) {
                    pingedClients.remove(clientInfo.sessionId);
                }
                return Optional.of(clientInfo);
            }
        }
        return Optional.empty();
    }

    /**
     * Tells the handler that the client is alive.
     *
     * <p>The response is only honored when a health check is in progress for the timeout under
     * which the session ID was issued and the session belongs to the same binder.
     *
     * @param client callback of the responding client
     * @param sessionId session ID the client received in its last ping
     */
    public void tellClientAlive(ICarWatchdogServiceCallback client, int sessionId) {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                if (!mClientCheckInProgress.get(timeout)) {
                    continue;
                }
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                ClientInfo clientInfo = pingedClients.get(sessionId);
                if (clientInfo != null && clientInfo.client.asBinder() == client.asBinder()) {
                    pingedClients.remove(sessionId);
                    return;
                }
            }
        }
    }

    /**
     * Updates the user stopped state.
     *
     * @param userId user whose state changed
     * @param isStopped {@code true} to mark the user as stopped, {@code false} to clear the mark
     */
    public void updateUserState(@UserIdInt int userId, boolean isStopped) {
        synchronized (mLock) {
            if (isStopped) {
                mStoppedUser.put(userId, true);
            } else {
                mStoppedUser.delete(userId);
            }
        }
    }

    /**
     * Posts health check message.
     *
     * <p>The check is posted at the front of the main handler's queue.
     *
     * @param sessionId session ID to echo back to the daemon with the health check result
     */
    public void postHealthCheckMessage(int sessionId) {
        mMainHandler.postAtFrontOfQueue(() -> doHealthCheck(sessionId));
    }

    /** Returns the registered and alive client count. */
    public int getClientCount(int timeout) {
        synchronized (mLock) {
            ArrayList<ClientInfo> clients = mClientMap.get(timeout);
            return clients != null ? clients.size() : 0;
        }
    }

    /** Resets pinged clients before health checking */
    public void prepareHealthCheck() {
        synchronized (mLock) {
            for (int timeout : ALL_TIMEOUTS) {
                SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
                pingedClients.clear();
            }
        }
    }

    /** Enables/disables the watchdog daemon client health check process. */
    void controlProcessHealthCheck(boolean enable) {
        try {
            mCarWatchdogDaemonHelper.controlProcessHealthCheck(enable);
        } catch (RemoteException e) {
            Slogf.w(CarWatchdogService.TAG,
                    "Cannot enable/disable the car watchdog daemon health check process: %s", e);
        }
    }

    /** Drops a client whose binder died from the registered client list of its timeout. */
    private void onClientDeath(ICarWatchdogServiceCallback client, int timeout) {
        synchronized (mLock) {
            removeClientLocked(client.asBinder(), timeout);
        }
    }

    /** Runs one health check round for the daemon request identified by {@code sessionId}. */
    private void doHealthCheck(int sessionId) {
        // For critical clients, the response status are checked just before reporting to car
        // watchdog daemon. For moderate and normal clients, the status are checked after allowed
        // delay per timeout.
        analyzeClientResponse(TIMEOUT_CRITICAL);
        reportHealthCheckResult(sessionId);
        sendPingToClients(TIMEOUT_CRITICAL);
        sendPingToClientsAndCheck(TIMEOUT_MODERATE);
        sendPingToClientsAndCheck(TIMEOUT_NORMAL);
    }

    /**
     * Moves every still-pinged client of the given timeout, except those of stopped users, to
     * the not-responding list and marks the check for that timeout as finished.
     */
    private void analyzeClientResponse(int timeout) {
        // Clients which are not responding are stored in mClientsNotResponding, and will be dumped
        // and killed at the next response of CarWatchdogService to car watchdog daemon.
        synchronized (mLock) {
            SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
            for (int i = 0; i < pingedClients.size(); i++) {
                ClientInfo clientInfo = pingedClients.valueAt(i);
                if (mStoppedUser.get(clientInfo.userId)) {
                    continue;
                }
                mClientsNotResponding.add(clientInfo);
                removeClientLocked(clientInfo.client.asBinder(), timeout);
            }
            // Keyed write: the timeout is a map key, not a positional index.
            mClientCheckInProgress.put(timeout, false);
        }
    }

    /**
     * Assigns a fresh session ID to each registered client of the given timeout, records it as
     * pinged, and then sends the health check ping outside the lock.
     */
    private void sendPingToClients(int timeout) {
        ArrayList<ClientInfo> clientsToCheck;
        synchronized (mLock) {
            SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
            pingedClients.clear();
            // Snapshot the list so the binder calls below can run without holding mLock.
            clientsToCheck = new ArrayList<>(mClientMap.get(timeout));
            for (int i = 0; i < clientsToCheck.size(); i++) {
                ClientInfo clientInfo = clientsToCheck.get(i);
                if (mStoppedUser.get(clientInfo.userId)) {
                    continue;
                }
                int sessionId = getNewSessionId();
                clientInfo.sessionId = sessionId;
                pingedClients.put(sessionId, clientInfo);
            }
            // Keyed write: the timeout is a map key, not a positional index.
            mClientCheckInProgress.put(timeout, true);
        }

        // NOTE(review): clients of stopped users are skipped above but remain in clientsToCheck,
        // so they are still pinged here with their previous session ID. Confirm this is intended.
        for (int i = 0; i < clientsToCheck.size(); i++) {
            ClientInfo clientInfo = clientsToCheck.get(i);
            try {
                clientInfo.client.onCheckHealthStatus(clientInfo.sessionId, timeout);
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Sending a ping message to client(pid: %d) failed: %s",
                        clientInfo.pid, e);
                // The ping never reached the client, so do not wait for its response.
                synchronized (mLock) {
                    mPingedClientMap.get(timeout).remove(clientInfo.sessionId);
                }
            }
        }
    }

    /**
     * Pings the clients of the given timeout unless a check is already in progress, and
     * schedules the response analysis after the timeout duration.
     */
    private void sendPingToClientsAndCheck(int timeout) {
        synchronized (mLock) {
            if (mClientCheckInProgress.get(timeout)) {
                return;
            }
        }
        sendPingToClients(timeout);
        mMainHandler.postDelayed(
                () -> analyzeClientResponse(timeout), getTimeoutDurationMs(timeout));
    }

    /** Returns the next session ID; IDs are always positive and restart at 1 on overflow. */
    private int getNewSessionId() {
        synchronized (mLock) {
            if (++mLastSessionId <= 0) {
                mLastSessionId = 1;
            }
            return mLastSessionId;
        }
    }

    /** Removes the client owning {@code clientBinder} from the client list of {@code timeout}. */
    @GuardedBy("mLock")
    private void removeClientLocked(IBinder clientBinder, int timeout) {
        ArrayList<ClientInfo> clients = mClientMap.get(timeout);
        for (int i = 0; i < clients.size(); i++) {
            ClientInfo clientInfo = clients.get(i);
            if (clientBinder == clientInfo.client.asBinder()) {
                clients.remove(i);
                return;
            }
        }
    }

    /**
     * Notifies the not-responding clients about the upcoming termination and reports them,
     * together with this service's liveness, to the car watchdog daemon.
     */
    private void reportHealthCheckResult(int sessionId) {
        List<ProcessIdentifier> clientsNotResponding;
        ArrayList<ClientInfo> clientsToNotify;
        synchronized (mLock) {
            clientsNotResponding = toProcessIdentifierList(mClientsNotResponding);
            clientsToNotify = new ArrayList<>(mClientsNotResponding);
            mClientsNotResponding.clear();
        }
        for (int i = 0; i < clientsToNotify.size(); i++) {
            ClientInfo clientInfo = clientsToNotify.get(i);
            try {
                clientInfo.client.onPrepareProcessTermination();
            } catch (RemoteException e) {
                Slogf.w(CarWatchdogService.TAG,
                        "Notifying onPrepareProcessTermination to client(pid: %d) failed: %s",
                        clientInfo.pid, e);
            }
        }

        try {
            mCarWatchdogDaemonHelper.tellCarWatchdogServiceAlive(
                    mWatchdogServiceForSystem, clientsNotResponding, sessionId);
        } catch (RemoteException | RuntimeException e) {
            Slogf.w(CarWatchdogService.TAG,
                    "Cannot respond to car watchdog daemon (sessionId=%d): %s", sessionId, e);
        }
    }

    /** Converts client infos to process identifiers carrying the pid and start time. */
    @NonNull
    private List<ProcessIdentifier> toProcessIdentifierList(
            @NonNull ArrayList<ClientInfo> clientInfos) {
        List<ProcessIdentifier> processIdentifiers = new ArrayList<>(clientInfos.size());
        for (int i = 0; i < clientInfos.size(); i++) {
            ClientInfo clientInfo = clientInfos.get(i);
            ProcessIdentifier processIdentifier = new ProcessIdentifier();
            processIdentifier.pid = clientInfo.pid;
            processIdentifier.startTimeMillis = clientInfo.startTimeMillis;
            processIdentifiers.add(processIdentifier);
        }
        return processIdentifiers;
    }

    /** Returns a human-readable name for the timeout, or "unknown" for unrecognized values. */
    private String timeoutToString(int timeout) {
        switch (timeout) {
            case TIMEOUT_CRITICAL:
                return "critical";
            case TIMEOUT_MODERATE:
                return "moderate";
            case TIMEOUT_NORMAL:
                return "normal";
            default:
                Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
                return "unknown";
        }
    }

    /**
     * Returns the response window in milliseconds for the timeout. When the health check window
     * is overridden, the override applies to every timeout.
     */
    private long getTimeoutDurationMs(int timeout) {
        if (mOverriddenClientHealthCheckWindowMs != MISSING_INT_PROPERTY_VALUE) {
            return mOverriddenClientHealthCheckWindowMs;
        }
        switch (timeout) {
            case TIMEOUT_CRITICAL:
                return 3000L;
            case TIMEOUT_MODERATE:
                return 5000L;
            case TIMEOUT_NORMAL:
                return 10000L;
            default:
                Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
                return 10000L;
        }
    }

    /**
     * Registration record of one client. It is also the client's death recipient, so it stays a
     * non-static inner class in order to call back into the enclosing handler.
     */
    private final class ClientInfo implements IBinder.DeathRecipient {
        public final ICarWatchdogServiceCallback client;
        public final int pid;
        public final long startTimeMillis;
        @UserIdInt public final int userId;
        public final int timeout;
        // Session ID of the latest ping sent to this client.
        public volatile int sessionId;

        ClientInfo(ICarWatchdogServiceCallback client, int pid, @UserIdInt int userId,
                int timeout) {
            this.client = client;
            this.pid = pid;
            // CarService doesn't have sepolicy access to read per-pid proc files, so it cannot
            // fetch the pid's actual start time. When a client process registers with
            // the CarService, it is safe to assume the process is still alive. So, populate
            // elapsed real time and the consumer (CarServiceHelperService) of this data should
            // verify that the actual start time is less than the reported start time.
            this.startTimeMillis = SystemClock.elapsedRealtime();
            this.userId = userId;
            this.timeout = timeout;
        }

        @Override
        public void binderDied() {
            Slogf.w(CarWatchdogService.TAG, "Client(pid: %d) died", pid);
            onClientDeath(client, timeout);
        }

        private void linkToDeath() throws RemoteException {
            client.asBinder().linkToDeath(this, 0);
        }

        private void unlinkToDeath() {
            client.asBinder().unlinkToDeath(this, 0);
        }

        @Override
        public String toString() {
            return "ClientInfo{client=" + client + ", pid=" + pid + ", startTimeMillis="
                    + startTimeMillis + ", userId=" + userId + ", timeout=" + timeout
                    + ", sessionId=" + sessionId + '}';
        }
    }
}