• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2021 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.car.watchdog;
18 
19 import static android.car.watchdog.CarWatchdogManager.TIMEOUT_CRITICAL;
20 import static android.car.watchdog.CarWatchdogManager.TIMEOUT_MODERATE;
21 import static android.car.watchdog.CarWatchdogManager.TIMEOUT_NORMAL;
22 
23 import static com.android.car.CarServiceUtils.getHandlerThread;
24 import static com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport.DUMP_INFO;
25 
26 import android.annotation.NonNull;
27 import android.annotation.UserIdInt;
28 import android.automotive.watchdog.internal.ICarWatchdogServiceForSystem;
29 import android.automotive.watchdog.internal.ProcessIdentifier;
30 import android.car.builtin.util.Slogf;
31 import android.car.watchdog.ICarWatchdogServiceCallback;
32 import android.car.watchdoglib.CarWatchdogDaemonHelper;
33 import android.os.Binder;
34 import android.os.Handler;
35 import android.os.IBinder;
36 import android.os.Looper;
37 import android.os.RemoteException;
38 import android.os.SystemClock;
39 import android.os.SystemProperties;
40 import android.util.SparseArray;
41 import android.util.SparseBooleanArray;
42 
43 import com.android.car.CarServiceHelperWrapper;
44 import com.android.car.internal.ExcludeFromCodeCoverageGeneratedReport;
45 import com.android.car.internal.util.IndentingPrintWriter;
46 import com.android.internal.annotations.GuardedBy;
47 
48 import java.util.ArrayList;
49 import java.util.List;
50 import java.util.Optional;
51 
52 /**
53  * Handles clients' health status checking and reporting the statuses to the watchdog daemon.
54  */
55 public final class WatchdogProcessHandler {
56     static final String PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL =
57             "ro.carwatchdog.client_healthcheck.interval";
58     static final int MISSING_INT_PROPERTY_VALUE = -1;
59 
60     private static final int[] ALL_TIMEOUTS =
61             { TIMEOUT_CRITICAL, TIMEOUT_MODERATE, TIMEOUT_NORMAL };
62 
63     private final ICarWatchdogServiceForSystem mWatchdogServiceForSystem;
64     private final CarWatchdogDaemonHelper mCarWatchdogDaemonHelper;
65     private final Handler mMainHandler = new Handler(Looper.getMainLooper());
66     private final Handler mServiceHandler = new Handler(getHandlerThread(
67             CarWatchdogService.class.getSimpleName()).getLooper());
68     private final Object mLock = new Object();
69     /*
70      * Keeps the list of car watchdog client according to timeout:
71      * key => timeout, value => ClientInfo list.
72      * The value of SparseArray is guarded by mLock.
73      */
74     @GuardedBy("mLock")
75     private final SparseArray<ArrayList<ClientInfo>> mClientMap = new SparseArray<>();
76     /*
77      * Keeps the map of car watchdog client being checked by CarWatchdogService according to
78      * timeout: key => timeout, value => ClientInfo map.
79      * The value is also a map: key => session id, value => ClientInfo.
80      */
81     @GuardedBy("mLock")
82     private final SparseArray<SparseArray<ClientInfo>> mPingedClientMap = new SparseArray<>();
83     /*
84      * Keeps whether client health checking is being performed according to timeout:
85      * key => timeout, value => boolean (whether client health checking is being performed).
86      * The value of SparseArray is guarded by mLock.
87      */
88     @GuardedBy("mLock")
89     private final SparseArray<Boolean> mClientCheckInProgress = new SparseArray<>();
90     @GuardedBy("mLock")
91     private final ArrayList<ClientInfo> mClientsNotResponding = new ArrayList<>();
92     // mLastSessionId should only be accessed from the main thread.
93     @GuardedBy("mLock")
94     private int mLastSessionId;
95     @GuardedBy("mLock")
96     private final SparseBooleanArray mStoppedUser = new SparseBooleanArray();
97 
98     private long mOverriddenClientHealthCheckWindowMs = MISSING_INT_PROPERTY_VALUE;
99 
WatchdogProcessHandler(ICarWatchdogServiceForSystem serviceImpl, CarWatchdogDaemonHelper daemonHelper)100     public WatchdogProcessHandler(ICarWatchdogServiceForSystem serviceImpl,
101             CarWatchdogDaemonHelper daemonHelper) {
102         mWatchdogServiceForSystem = serviceImpl;
103         mCarWatchdogDaemonHelper = daemonHelper;
104     }
105 
106     /** Initializes the handler. */
init()107     public void init() {
108         synchronized (mLock) {
109             for (int timeout : ALL_TIMEOUTS) {
110                 mClientMap.put(timeout, new ArrayList<ClientInfo>());
111                 mPingedClientMap.put(timeout, new SparseArray<ClientInfo>());
112                 mClientCheckInProgress.put(timeout, false);
113             }
114         }
115         // Overridden timeout value must be greater than  or equal to the maximum possible timeout
116         // value. Otherwise, clients will be pinged more frequently than the guaranteed timeout
117         // duration.
118         int clientHealthCheckWindowSec = SystemProperties.getInt(
119                 PROPERTY_RO_CLIENT_HEALTHCHECK_INTERVAL, MISSING_INT_PROPERTY_VALUE);
120         if (clientHealthCheckWindowSec != MISSING_INT_PROPERTY_VALUE) {
121             mOverriddenClientHealthCheckWindowMs = Math.max(clientHealthCheckWindowSec * 1000L,
122                     getTimeoutDurationMs(TIMEOUT_NORMAL));
123         }
124         if (CarWatchdogService.DEBUG) {
125             Slogf.d(CarWatchdogService.TAG, "WatchdogProcessHandler is initialized");
126         }
127     }
128 
129     /** Dumps its state. */
130     @ExcludeFromCodeCoverageGeneratedReport(reason = DUMP_INFO)
dump(IndentingPrintWriter writer)131     public void dump(IndentingPrintWriter writer) {
132         synchronized (mLock) {
133             writer.println("Registered clients");
134             writer.increaseIndent();
135             int count = 1;
136             for (int timeout : ALL_TIMEOUTS) {
137                 ArrayList<ClientInfo> clients = mClientMap.get(timeout);
138                 String timeoutStr = timeoutToString(timeout);
139                 for (ClientInfo clientInfo : clients) {
140                     writer.printf("client #%d: timeout = %s, pid = %d\n", count++, timeoutStr,
141                             clientInfo.pid);
142                 }
143             }
144             writer.printf("Stopped users: ");
145             int size = mStoppedUser.size();
146             if (size > 0) {
147                 writer.printf("%d", mStoppedUser.keyAt(0));
148                 for (int i = 1; i < size; i++) {
149                     writer.printf(", %d", mStoppedUser.keyAt(i));
150                 }
151                 writer.println();
152             } else {
153                 writer.println("none");
154             }
155             writer.decreaseIndent();
156         }
157     }
158 
159     /** Registers the client callback */
registerClient(ICarWatchdogServiceCallback client, int timeout)160     public void registerClient(ICarWatchdogServiceCallback client, int timeout) {
161         synchronized (mLock) {
162             ArrayList<ClientInfo> clients = mClientMap.get(timeout);
163             if (clients == null) {
164                 Slogf.w(CarWatchdogService.TAG, "Cannot register the client: invalid timeout");
165                 return;
166             }
167             IBinder binder = client.asBinder();
168             for (int i = 0; i < clients.size(); i++) {
169                 ClientInfo clientInfo = clients.get(i);
170                 if (binder == clientInfo.client.asBinder()) {
171                     Slogf.w(CarWatchdogService.TAG,
172                             "Cannot register the client: the client(pid: %d) has been already "
173                             + "registered", clientInfo.pid);
174                     return;
175                 }
176             }
177             int pid = Binder.getCallingPid();
178             int userId = Binder.getCallingUserHandle().getIdentifier();
179             ClientInfo clientInfo = new ClientInfo(client, pid, userId, timeout);
180             try {
181                 clientInfo.linkToDeath();
182             } catch (RemoteException e) {
183                 Slogf.w(CarWatchdogService.TAG,
184                         "Cannot register the client: linkToDeath to the client failed");
185                 return;
186             }
187             clients.add(clientInfo);
188             if (CarWatchdogService.DEBUG) {
189                 Slogf.d(CarWatchdogService.TAG, "Registered client: %s", clientInfo);
190             }
191         }
192     }
193 
194     /** Unregisters the previously registered client callback */
unregisterClient(ICarWatchdogServiceCallback client)195     public void unregisterClient(ICarWatchdogServiceCallback client) {
196         ClientInfo clientInfo;
197         synchronized (mLock) {
198             IBinder binder = client.asBinder();
199             // Even if a client did not respond to the latest ping, CarWatchdogService should honor
200             // the unregister request at this point and remove it from all internal caches.
201             // Otherwise, the client might be killed even after unregistering.
202             Optional<ClientInfo> optionalClientInfo = removeFromClientMapsLocked(binder);
203             if (optionalClientInfo.isEmpty()) {
204                 Slogf.w(CarWatchdogService.TAG,
205                         "Cannot unregister the client: the client has not been registered before");
206                 return;
207             }
208             clientInfo = optionalClientInfo.get();
209             for (int i = 0; i < mClientsNotResponding.size(); i++) {
210                 ClientInfo notRespondingClientInfo = mClientsNotResponding.get(i);
211                 if (binder == notRespondingClientInfo.client.asBinder()) {
212                     mClientsNotResponding.remove(i);
213                     break;
214                 }
215             }
216         }
217         if (CarWatchdogService.DEBUG) {
218             Slogf.d(CarWatchdogService.TAG, "Unregistered client: %s", clientInfo);
219         }
220     }
221 
222     @GuardedBy("mLock")
removeFromClientMapsLocked(IBinder binder)223     private Optional<ClientInfo> removeFromClientMapsLocked(IBinder binder) {
224         for (int timeout : ALL_TIMEOUTS) {
225             ArrayList<ClientInfo> clients = mClientMap.get(timeout);
226             for (int i = 0; i < clients.size(); i++) {
227                 ClientInfo clientInfo = clients.get(i);
228                 if (binder != clientInfo.client.asBinder()) {
229                     continue;
230                 }
231                 clientInfo.unlinkToDeath();
232                 clients.remove(i);
233                 SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
234                 if (pingedClients != null) {
235                     pingedClients.remove(clientInfo.sessionId);
236                 }
237                 return Optional.of(clientInfo);
238             }
239         }
240         return Optional.empty();
241     }
242 
243     /** Tells the handler that the client is alive. */
tellClientAlive(ICarWatchdogServiceCallback client, int sessionId)244     public void tellClientAlive(ICarWatchdogServiceCallback client, int sessionId) {
245         synchronized (mLock) {
246             for (int timeout : ALL_TIMEOUTS) {
247                 if (!mClientCheckInProgress.get(timeout)) {
248                     continue;
249                 }
250                 SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
251                 ClientInfo clientInfo = pingedClients.get(sessionId);
252                 if (clientInfo != null && clientInfo.client.asBinder() == client.asBinder()) {
253                     pingedClients.remove(sessionId);
254                     return;
255                 }
256             }
257         }
258     }
259 
260     /** Updates the user stopped state */
updateUserState(@serIdInt int userId, boolean isStopped)261     public void updateUserState(@UserIdInt int userId, boolean isStopped) {
262         synchronized (mLock) {
263             if (isStopped) {
264                 mStoppedUser.put(userId, true);
265             } else {
266                 mStoppedUser.delete(userId);
267             }
268         }
269     }
270 
271     /** Posts health check message */
postHealthCheckMessage(int sessionId)272     public void postHealthCheckMessage(int sessionId) {
273         mMainHandler.postAtFrontOfQueue(() -> doHealthCheck(sessionId));
274     }
275 
276     /** Returns the registered and alive client count. */
getClientCount(int timeout)277     public int getClientCount(int timeout) {
278         synchronized (mLock) {
279             ArrayList<ClientInfo> clients = mClientMap.get(timeout);
280             return clients != null ? clients.size() : 0;
281         }
282     }
283 
284     /** Resets pinged clients before health checking */
prepareHealthCheck()285     public void prepareHealthCheck() {
286         synchronized (mLock) {
287             for (int timeout : ALL_TIMEOUTS) {
288                 SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
289                 pingedClients.clear();
290             }
291         }
292     }
293 
294     /**
295      * Asynchronously fetches the AIDL VHAL pid from SystemServer.
296      *
297      * On fetching the AIDL VHAL pid, car watchdog daemon is updated via an async callback.
298      */
asyncFetchAidlVhalPid()299     public void asyncFetchAidlVhalPid() {
300         mServiceHandler.post(() -> {
301             int pid = CarServiceHelperWrapper.getInstance().fetchAidlVhalPid();
302             if (pid < 0) {
303                 Slogf.e(CarWatchdogService.TAG, "Failed to fetch AIDL VHAL pid from"
304                         + " CarServiceHelperService");
305                 return;
306             }
307             try {
308                 mCarWatchdogDaemonHelper.onAidlVhalPidFetched(pid);
309             } catch (RemoteException e) {
310                 Slogf.e(CarWatchdogService.TAG,
311                         "Failed to notify car watchdog daemon of the AIDL VHAL pid");
312             }
313         });
314     }
315 
316     /** Enables/disables the watchdog daemon client health check process. */
controlProcessHealthCheck(boolean enable)317     void controlProcessHealthCheck(boolean enable) {
318         try {
319             mCarWatchdogDaemonHelper.controlProcessHealthCheck(enable);
320         } catch (RemoteException e) {
321             Slogf.w(CarWatchdogService.TAG,
322                     "Cannot enable/disable the car watchdog daemon health check process: %s", e);
323         }
324     }
325 
onClientDeath(ICarWatchdogServiceCallback client, int timeout)326     private void onClientDeath(ICarWatchdogServiceCallback client, int timeout) {
327         synchronized (mLock) {
328             removeClientLocked(client.asBinder(), timeout);
329         }
330     }
331 
doHealthCheck(int sessionId)332     private void doHealthCheck(int sessionId) {
333         // For critical clients, the response status are checked just before reporting to car
334         // watchdog daemon. For moderate and normal clients, the status are checked after allowed
335         // delay per timeout.
336         analyzeClientResponse(TIMEOUT_CRITICAL);
337         reportHealthCheckResult(sessionId);
338         sendPingToClients(TIMEOUT_CRITICAL);
339         sendPingToClientsAndCheck(TIMEOUT_MODERATE);
340         sendPingToClientsAndCheck(TIMEOUT_NORMAL);
341     }
342 
analyzeClientResponse(int timeout)343     private void analyzeClientResponse(int timeout) {
344         // Clients which are not responding are stored in mClientsNotResponding, and will be dumped
345         // and killed at the next response of CarWatchdogService to car watchdog daemon.
346         synchronized (mLock) {
347             SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
348             for (int i = 0; i < pingedClients.size(); i++) {
349                 ClientInfo clientInfo = pingedClients.valueAt(i);
350                 if (mStoppedUser.get(clientInfo.userId)) {
351                     continue;
352                 }
353                 mClientsNotResponding.add(clientInfo);
354                 removeClientLocked(clientInfo.client.asBinder(), timeout);
355             }
356             mClientCheckInProgress.setValueAt(timeout, false);
357         }
358     }
359 
sendPingToClients(int timeout)360     private void sendPingToClients(int timeout) {
361         ArrayList<ClientInfo> clientsToCheck;
362         synchronized (mLock) {
363             SparseArray<ClientInfo> pingedClients = mPingedClientMap.get(timeout);
364             pingedClients.clear();
365             clientsToCheck = new ArrayList<>(mClientMap.get(timeout));
366             for (int i = 0; i < clientsToCheck.size(); i++) {
367                 ClientInfo clientInfo = clientsToCheck.get(i);
368                 if (mStoppedUser.get(clientInfo.userId)) {
369                     continue;
370                 }
371                 int sessionId = getNewSessionId();
372                 clientInfo.sessionId = sessionId;
373                 pingedClients.put(sessionId, clientInfo);
374             }
375             mClientCheckInProgress.setValueAt(timeout, true);
376         }
377 
378         for (int i = 0; i < clientsToCheck.size(); i++) {
379             ClientInfo clientInfo = clientsToCheck.get(i);
380             try {
381                 clientInfo.client.onCheckHealthStatus(clientInfo.sessionId, timeout);
382             } catch (RemoteException e) {
383                 Slogf.w(CarWatchdogService.TAG,
384                         "Sending a ping message to client(pid: %d) failed: %s",
385                         clientInfo.pid, e);
386                 synchronized (mLock) {
387                     mPingedClientMap.get(timeout).remove(clientInfo.sessionId);
388                 }
389             }
390         }
391     }
392 
sendPingToClientsAndCheck(int timeout)393     private void sendPingToClientsAndCheck(int timeout) {
394         synchronized (mLock) {
395             if (mClientCheckInProgress.get(timeout)) {
396                 return;
397             }
398         }
399         sendPingToClients(timeout);
400         mMainHandler.postDelayed(
401                 () -> analyzeClientResponse(timeout), getTimeoutDurationMs(timeout));
402     }
403 
getNewSessionId()404     private int getNewSessionId() {
405         synchronized (mLock) {
406             if (++mLastSessionId <= 0) {
407                 mLastSessionId = 1;
408             }
409             return mLastSessionId;
410         }
411     }
412 
413     @GuardedBy("mLock")
removeClientLocked(IBinder clientBinder, int timeout)414     private void removeClientLocked(IBinder clientBinder, int timeout) {
415         ArrayList<ClientInfo> clients = mClientMap.get(timeout);
416         for (int i = 0; i < clients.size(); i++) {
417             ClientInfo clientInfo = clients.get(i);
418             if (clientBinder == clientInfo.client.asBinder()) {
419                 clients.remove(i);
420                 return;
421             }
422         }
423     }
424 
reportHealthCheckResult(int sessionId)425     private void reportHealthCheckResult(int sessionId) {
426         List<ProcessIdentifier> clientsNotResponding;
427         ArrayList<ClientInfo> clientsToNotify;
428         synchronized (mLock) {
429             clientsNotResponding = toProcessIdentifierList(mClientsNotResponding);
430             clientsToNotify = new ArrayList<>(mClientsNotResponding);
431             mClientsNotResponding.clear();
432         }
433         for (int i = 0; i < clientsToNotify.size(); i++) {
434             ClientInfo clientInfo = clientsToNotify.get(i);
435             try {
436                 clientInfo.client.onPrepareProcessTermination();
437             } catch (RemoteException e) {
438                 Slogf.w(CarWatchdogService.TAG,
439                         "Notifying onPrepareProcessTermination to client(pid: %d) failed: %s",
440                         clientInfo.pid, e);
441             }
442         }
443 
444         try {
445             mCarWatchdogDaemonHelper.tellCarWatchdogServiceAlive(
446                     mWatchdogServiceForSystem, clientsNotResponding, sessionId);
447         } catch (RemoteException | RuntimeException e) {
448             Slogf.w(CarWatchdogService.TAG,
449                     "Cannot respond to car watchdog daemon (sessionId=%d): %s", sessionId, e);
450         }
451     }
452 
453     @NonNull
toProcessIdentifierList( @onNull ArrayList<ClientInfo> clientInfos)454     private List<ProcessIdentifier> toProcessIdentifierList(
455             @NonNull ArrayList<ClientInfo> clientInfos) {
456         List<ProcessIdentifier> processIdentifiers = new ArrayList<>(clientInfos.size());
457         for (int i = 0; i < clientInfos.size(); i++) {
458             ClientInfo clientInfo = clientInfos.get(i);
459             ProcessIdentifier processIdentifier = new ProcessIdentifier();
460             processIdentifier.pid = clientInfo.pid;
461             processIdentifier.startTimeMillis = clientInfo.startTimeMillis;
462             processIdentifiers.add(processIdentifier);
463         }
464         return processIdentifiers;
465     }
466 
timeoutToString(int timeout)467     private String timeoutToString(int timeout) {
468         switch (timeout) {
469             case TIMEOUT_CRITICAL:
470                 return "critical";
471             case TIMEOUT_MODERATE:
472                 return "moderate";
473             case TIMEOUT_NORMAL:
474                 return "normal";
475             default:
476                 Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
477                 return "unknown";
478         }
479     }
480 
getTimeoutDurationMs(int timeout)481     private long getTimeoutDurationMs(int timeout) {
482         if (mOverriddenClientHealthCheckWindowMs != MISSING_INT_PROPERTY_VALUE) {
483             return mOverriddenClientHealthCheckWindowMs;
484         }
485         switch (timeout) {
486             case TIMEOUT_CRITICAL:
487                 return 3000L;
488             case TIMEOUT_MODERATE:
489                 return 5000L;
490             case TIMEOUT_NORMAL:
491                 return 10000L;
492             default:
493                 Slogf.w(CarWatchdogService.TAG, "Unknown timeout value");
494                 return 10000L;
495         }
496     }
497 
498     private final class ClientInfo implements IBinder.DeathRecipient {
499         public final ICarWatchdogServiceCallback client;
500         public final int pid;
501         public final long startTimeMillis;
502         @UserIdInt public final int userId;
503         public final int timeout;
504         public volatile int sessionId;
505 
ClientInfo(ICarWatchdogServiceCallback client, int pid, @UserIdInt int userId, int timeout)506         ClientInfo(ICarWatchdogServiceCallback client, int pid, @UserIdInt int userId,
507                 int timeout) {
508             this.client = client;
509             this.pid = pid;
510             // CarService doesn't have sepolicy access to read per-pid proc files, so it cannot
511             // fetch the pid's actual start time. When a client process registers with
512             // the CarService, it is safe to assume the process is still alive. So, populate
513             // elapsed real time and the consumer (CarServiceHelperService) of this data should
514             // verify that the actual start time is less than the reported start time.
515             this.startTimeMillis = SystemClock.elapsedRealtime();
516             this.userId = userId;
517             this.timeout = timeout;
518         }
519 
520         @Override
binderDied()521         public void binderDied() {
522             Slogf.w(CarWatchdogService.TAG, "Client(pid: %d) died", pid);
523             onClientDeath(client, timeout);
524         }
525 
linkToDeath()526         private void linkToDeath() throws RemoteException {
527             client.asBinder().linkToDeath(this, 0);
528         }
529 
unlinkToDeath()530         private void unlinkToDeath() {
531             client.asBinder().unlinkToDeath(this, 0);
532         }
533 
534         @Override
toString()535         public String toString() {
536             return "ClientInfo{client=" + client + ", pid=" + pid + ", startTimeMillis="
537                     + startTimeMillis + ", userId=" + userId + ", timeout=" + timeout
538                     + ", sessionId=" + sessionId + '}';
539         }
540     }
541 }
542