• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout;
20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout;
21 
22 import android.annotation.NonNull;
23 import android.annotation.Nullable;
24 import android.app.IActivityController;
25 import android.content.BroadcastReceiver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.database.ContentObserver;
30 import android.hidl.manager.V1_0.IServiceManager;
31 import android.net.Uri;
32 import android.os.Binder;
33 import android.os.Build;
34 import android.os.Debug;
35 import android.os.FileUtils;
36 import android.os.Handler;
37 import android.os.IPowerManager;
38 import android.os.Looper;
39 import android.os.Process;
40 import android.os.RemoteException;
41 import android.os.ServiceDebugInfo;
42 import android.os.ServiceManager;
43 import android.os.SystemClock;
44 import android.os.SystemProperties;
45 import android.os.UserHandle;
46 import android.provider.Settings;
47 import android.sysprop.WatchdogProperties;
48 import android.util.Dumpable;
49 import android.util.EventLog;
50 import android.util.Log;
51 import android.util.Slog;
52 import android.util.SparseArray;
53 
54 import com.android.internal.os.BackgroundThread;
55 import com.android.internal.os.ProcessCpuTracker;
56 import com.android.internal.os.ZygoteConnectionConstants;
57 import com.android.internal.util.FrameworkStatsLog;
58 import com.android.server.am.ActivityManagerService;
59 import com.android.server.am.TraceErrorLogger;
60 import com.android.server.criticalevents.CriticalEventLog;
61 import com.android.server.wm.SurfaceAnimationThread;
62 
63 import java.io.BufferedReader;
64 import java.io.File;
65 import java.io.FileNotFoundException;
66 import java.io.FileReader;
67 import java.io.FileWriter;
68 import java.io.IOException;
69 import java.io.PrintWriter;
70 import java.io.StringWriter;
71 import java.util.ArrayList;
72 import java.util.Arrays;
73 import java.util.Collections;
74 import java.util.HashSet;
75 import java.util.List;
76 import java.util.Optional;
77 import java.util.UUID;
78 import java.util.concurrent.TimeUnit;
79 
80 /**
81  * This class calls its monitor every minute. Killing this process if they don't return
82  **/
83 public class Watchdog implements Dumpable {
84     static final String TAG = "Watchdog";
85 
86     /** Debug flag. */
87     public static final boolean DEBUG = false;
88 
89     // Set this to true to use debug default values.
90     private static final boolean DB = false;
91 
92     // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
93     //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
94     //         can trigger the watchdog.
95     // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
96     //         applications may not work with a debug build. CTS will fail.
97     private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000;
98 
99     // These are temporally ordered: larger values as lateness increases
100     private static final int COMPLETED = 0;
101     private static final int WAITING = 1;
102     private static final int WAITED_HALF = 2;
103     private static final int OVERDUE = 3;
104 
105     // Track watchdog timeout history and break the crash loop if there is.
106     private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
107     private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
108     private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
109             "framework_watchdog.fatal_window.second";
110 
111     // Which native processes to dump into dropbox's stack traces
112     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
113         "/system/bin/audioserver",
114         "/system/bin/cameraserver",
115         "/system/bin/drmserver",
116         "/system/bin/keystore2",
117         "/system/bin/mediadrmserver",
118         "/system/bin/mediaserver",
119         "/system/bin/netd",
120         "/system/bin/sdcard",
121         "/system/bin/surfaceflinger",
122         "/system/bin/vold",
123         "media.extractor", // system/bin/mediaextractor
124         "media.metrics", // system/bin/mediametrics
125         "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
126         "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
127         "media.transcoding", // Media transcoding service
128         "com.android.bluetooth",  // Bluetooth service
129         "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
130     };
131 
132     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
133             "android.hardware.audio@4.0::IDevicesFactory",
134             "android.hardware.audio@5.0::IDevicesFactory",
135             "android.hardware.audio@6.0::IDevicesFactory",
136             "android.hardware.audio@7.0::IDevicesFactory",
137             "android.hardware.biometrics.face@1.0::IBiometricsFace",
138             "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
139             "android.hardware.bluetooth@1.0::IBluetoothHci",
140             "android.hardware.camera.provider@2.4::ICameraProvider",
141             "android.hardware.gnss@1.0::IGnss",
142             "android.hardware.graphics.allocator@2.0::IAllocator",
143             "android.hardware.graphics.composer@2.1::IComposer",
144             "android.hardware.health@2.0::IHealth",
145             "android.hardware.light@2.0::ILight",
146             "android.hardware.media.c2@1.0::IComponentStore",
147             "android.hardware.media.omx@1.0::IOmx",
148             "android.hardware.media.omx@1.0::IOmxStore",
149             "android.hardware.neuralnetworks@1.0::IDevice",
150             "android.hardware.power@1.0::IPower",
151             "android.hardware.power.stats@1.0::IPowerStats",
152             "android.hardware.sensors@1.0::ISensors",
153             "android.hardware.sensors@2.0::ISensors",
154             "android.hardware.sensors@2.1::ISensors",
155             "android.hardware.vibrator@1.0::IVibrator",
156             "android.hardware.vr@1.0::IVr",
157             "android.system.suspend@1.0::ISystemSuspend"
158     );
159 
160     public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
161             "android.hardware.biometrics.face.IFace/",
162             "android.hardware.biometrics.fingerprint.IFingerprint/",
163             "android.hardware.graphics.composer3.IComposer/",
164             "android.hardware.input.processor.IInputProcessor/",
165             "android.hardware.light.ILights/",
166             "android.hardware.power.IPower/",
167             "android.hardware.power.stats.IPowerStats/",
168             "android.hardware.vibrator.IVibrator/",
169             "android.hardware.vibrator.IVibratorManager/"
170     };
171 
172     private static Watchdog sWatchdog;
173 
174     private final Thread mThread;
175 
176     private final Object mLock = new Object();
177 
178     /* This handler will be used to post message back onto the main thread */
179     private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>();
180     private final HandlerChecker mMonitorChecker;
181     private ActivityManagerService mActivity;
182     private IActivityController mController;
183     private boolean mAllowRestart = true;
184     // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings
185     // once the settings provider is initialized.
186     private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT;
187     private final List<Integer> mInterestingJavaPids = new ArrayList<>();
188     private final TraceErrorLogger mTraceErrorLogger;
189 
190     /** Holds a checker and its timeout. */
191     static final class HandlerCheckerAndTimeout {
192         private final HandlerChecker mHandler;
193         private final Optional<Long> mCustomTimeoutMillis;
194 
HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)195         private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) {
196             this.mHandler = checker;
197             this.mCustomTimeoutMillis = timeoutMillis;
198         }
199 
checker()200         HandlerChecker checker() {
201             return mHandler;
202         }
203 
204         /** Returns the timeout. */
customTimeoutMillis()205         Optional<Long> customTimeoutMillis() {
206             return mCustomTimeoutMillis;
207         }
208 
209         /**
210          * Creates a checker with the default timeout. The timeout will use the default value which
211          * is configurable server-side.
212          */
withDefaultTimeout(HandlerChecker checker)213         static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) {
214             return new HandlerCheckerAndTimeout(checker, Optional.empty());
215         }
216 
217         /**
218          * Creates a checker with a custom timeout. The timeout overrides the default value and will
219          * always be used.
220          */
withCustomTimeout( HandlerChecker checker, long timeoutMillis)221         static HandlerCheckerAndTimeout withCustomTimeout(
222                 HandlerChecker checker, long timeoutMillis) {
223             return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis));
224         }
225     }
226 
227     /**
228      * Used for checking status of handle threads and scheduling monitor callbacks.
229      */
230     public final class HandlerChecker implements Runnable {
231         private final Handler mHandler;
232         private final String mName;
233         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
234         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
235         private long mWaitMax;
236         private boolean mCompleted;
237         private Monitor mCurrentMonitor;
238         private long mStartTime;
239         private int mPauseCount;
240 
HandlerChecker(Handler handler, String name)241         HandlerChecker(Handler handler, String name) {
242             mHandler = handler;
243             mName = name;
244             mCompleted = true;
245         }
246 
addMonitorLocked(Monitor monitor)247         void addMonitorLocked(Monitor monitor) {
248             // We don't want to update mMonitors when the Handler is in the middle of checking
249             // all monitors. We will update mMonitors on the next schedule if it is safe
250             mMonitorQueue.add(monitor);
251         }
252 
253         /**
254          * Schedules a run on the handler thread.
255          *
256          * @param handlerCheckerTimeoutMillis the timeout to use for this run
257          */
scheduleCheckLocked(long handlerCheckerTimeoutMillis)258         public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
259             mWaitMax = handlerCheckerTimeoutMillis;
260             if (mCompleted) {
261                 // Safe to update monitors in queue, Handler is not in the middle of work
262                 mMonitors.addAll(mMonitorQueue);
263                 mMonitorQueue.clear();
264             }
265             if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
266                     || (mPauseCount > 0)) {
267                 // Don't schedule until after resume OR
268                 // If the target looper has recently been polling, then
269                 // there is no reason to enqueue our checker on it since that
270                 // is as good as it not being deadlocked.  This avoid having
271                 // to do a context switch to check the thread. Note that we
272                 // only do this if we have no monitors since those would need to
273                 // be executed at this point.
274                 mCompleted = true;
275                 return;
276             }
277             if (!mCompleted) {
278                 // we already have a check in flight, so no need
279                 return;
280             }
281 
282             mCompleted = false;
283             mCurrentMonitor = null;
284             mStartTime = SystemClock.uptimeMillis();
285             mHandler.postAtFrontOfQueue(this);
286         }
287 
getCompletionStateLocked()288         public int getCompletionStateLocked() {
289             if (mCompleted) {
290                 return COMPLETED;
291             } else {
292                 long latency = SystemClock.uptimeMillis() - mStartTime;
293                 if (latency < mWaitMax/2) {
294                     return WAITING;
295                 } else if (latency < mWaitMax) {
296                     return WAITED_HALF;
297                 }
298             }
299             return OVERDUE;
300         }
301 
getThread()302         public Thread getThread() {
303             return mHandler.getLooper().getThread();
304         }
305 
getName()306         public String getName() {
307             return mName;
308         }
309 
describeBlockedStateLocked()310         String describeBlockedStateLocked() {
311             if (mCurrentMonitor == null) {
312                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
313             } else {
314                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
315                         + " on " + mName + " (" + getThread().getName() + ")";
316             }
317         }
318 
319         @Override
run()320         public void run() {
321             // Once we get here, we ensure that mMonitors does not change even if we call
322             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
323             // move them to mMonitors on the next schedule when mCompleted is true, at which
324             // point we have completed execution of this method.
325             final int size = mMonitors.size();
326             for (int i = 0 ; i < size ; i++) {
327                 synchronized (mLock) {
328                     mCurrentMonitor = mMonitors.get(i);
329                 }
330                 mCurrentMonitor.monitor();
331             }
332 
333             synchronized (mLock) {
334                 mCompleted = true;
335                 mCurrentMonitor = null;
336             }
337         }
338 
339         /** Pause the HandlerChecker. */
pauseLocked(String reason)340         public void pauseLocked(String reason) {
341             mPauseCount++;
342             // Mark as completed, because there's a chance we called this after the watchog
343             // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
344             // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
345             mCompleted = true;
346             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
347                     + reason + ". Pause count: " + mPauseCount);
348         }
349 
350         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)351         public void resumeLocked(String reason) {
352             if (mPauseCount > 0) {
353                 mPauseCount--;
354                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
355                         + reason + ". Pause count: " + mPauseCount);
356             } else {
357                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
358             }
359         }
360     }
361 
362     final class RebootRequestReceiver extends BroadcastReceiver {
363         @Override
onReceive(Context c, Intent intent)364         public void onReceive(Context c, Intent intent) {
365             if (intent.getIntExtra("nowait", 0) != 0) {
366                 rebootSystem("Received ACTION_REBOOT broadcast");
367                 return;
368             }
369             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
370         }
371     }
372 
373     /** Monitor for checking the availability of binder threads. The monitor will block until
374      * there is a binder thread available to process in coming IPCs to make sure other processes
375      * can still communicate with the service.
376      */
377     private static final class BinderThreadMonitor implements Watchdog.Monitor {
378         @Override
monitor()379         public void monitor() {
380             Binder.blockUntilThreadAvailable();
381         }
382     }
383 
384     public interface Monitor {
monitor()385         void monitor();
386     }
387 
getInstance()388     public static Watchdog getInstance() {
389         if (sWatchdog == null) {
390             sWatchdog = new Watchdog();
391         }
392 
393         return sWatchdog;
394     }
395 
Watchdog()396     private Watchdog() {
397         mThread = new Thread(this::run, "watchdog");
398 
399         // Initialize handler checkers for each common thread we want to check.  Note
400         // that we are not currently checking the background thread, since it can
401         // potentially hold longer running operations with no guarantees about the timeliness
402         // of operations there.
403         //
404         // The shared foreground thread is the main checker.  It is where we
405         // will also dispatch monitor checks and do other work.
406         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
407                 "foreground thread");
408         mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));
409         // Add checker for main thread.  We only do a quick check since there
410         // can be UI running on the thread.
411         mHandlerCheckers.add(withDefaultTimeout(
412                 new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread")));
413         // Add checker for shared UI thread.
414         mHandlerCheckers.add(withDefaultTimeout(
415                 new HandlerChecker(UiThread.getHandler(), "ui thread")));
416         // And also check IO thread.
417         mHandlerCheckers.add(withDefaultTimeout(
418                 new HandlerChecker(IoThread.getHandler(), "i/o thread")));
419         // And the display thread.
420         mHandlerCheckers.add(withDefaultTimeout(
421                 new HandlerChecker(DisplayThread.getHandler(), "display thread")));
422         // And the animation thread.
423         mHandlerCheckers.add(withDefaultTimeout(
424                  new HandlerChecker(AnimationThread.getHandler(), "animation thread")));
425         // And the surface animation thread.
426         mHandlerCheckers.add(withDefaultTimeout(
427                 new HandlerChecker(SurfaceAnimationThread.getHandler(),
428                     "surface animation thread")));
429         // Initialize monitor for Binder threads.
430         addMonitor(new BinderThreadMonitor());
431 
432         mInterestingJavaPids.add(Process.myPid());
433 
434         // See the notes on DEFAULT_TIMEOUT.
435         assert DB ||
436                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
437 
438         mTraceErrorLogger = new TraceErrorLogger();
439     }
440 
441     /**
442      * Called by SystemServer to cause the internal thread to begin execution.
443      */
start()444     public void start() {
445         mThread.start();
446     }
447 
448     /**
449      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
450      * Should be called during boot after the ActivityManagerService is up and registered
451      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
452      */
init(Context context, ActivityManagerService activity)453     public void init(Context context, ActivityManagerService activity) {
454         mActivity = activity;
455         context.registerReceiver(new RebootRequestReceiver(),
456                 new IntentFilter(Intent.ACTION_REBOOT),
457                 android.Manifest.permission.REBOOT, null);
458     }
459 
460     private static class SettingsObserver extends ContentObserver {
461         private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS);
462         private final Context mContext;
463         private final Watchdog mWatchdog;
464 
SettingsObserver(Context context, Watchdog watchdog)465         SettingsObserver(Context context, Watchdog watchdog) {
466             super(BackgroundThread.getHandler());
467             mContext = context;
468             mWatchdog = watchdog;
469             // Always kick once to ensure that we match current state
470             onChange();
471         }
472 
473         @Override
onChange(boolean selfChange, Uri uri, int userId)474         public void onChange(boolean selfChange, Uri uri, int userId) {
475             if (mUri.equals(uri)) {
476                 onChange();
477             }
478         }
479 
onChange()480         public void onChange() {
481             try {
482                 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong(
483                         mContext.getContentResolver(),
484                         Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT));
485             } catch (RuntimeException e) {
486                 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e);
487             }
488         }
489     }
490 
491     /**
492      * Register an observer to listen to settings.
493      *
494      * It needs to be called after the settings service is initialized.
495      */
registerSettingsObserver(Context context)496     public void registerSettingsObserver(Context context) {
497         context.getContentResolver().registerContentObserver(
498                 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS),
499                 false,
500                 new SettingsObserver(context, this),
501                 UserHandle.USER_SYSTEM);
502     }
503 
504     /**
505      * Updates watchdog timeout values.
506      */
updateWatchdogTimeout(long timeoutMillis)507     void updateWatchdogTimeout(long timeoutMillis) {
508         // See the notes on DEFAULT_TIMEOUT.
509         if (!DB && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) {
510             timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1;
511         }
512         mWatchdogTimeoutMillis = timeoutMillis;
513         Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis");
514     }
515 
isInterestingJavaProcess(String processName)516     private static boolean isInterestingJavaProcess(String processName) {
517         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
518                 || processName.equals("com.android.phone");
519     }
520 
521     /**
522      * Notifies the watchdog when a Java process with {@code pid} is started.
523      * This process may have its stack trace dumped during an ANR.
524      */
processStarted(String processName, int pid)525     public void processStarted(String processName, int pid) {
526         if (isInterestingJavaProcess(processName)) {
527             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
528             synchronized (mLock) {
529                 mInterestingJavaPids.add(pid);
530             }
531         }
532     }
533 
534     /**
535      * Notifies the watchdog when a Java process with {@code pid} dies.
536      */
processDied(String processName, int pid)537     public void processDied(String processName, int pid) {
538         if (isInterestingJavaProcess(processName)) {
539             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
540             synchronized (mLock) {
541                 mInterestingJavaPids.remove(Integer.valueOf(pid));
542             }
543         }
544     }
545 
setActivityController(IActivityController controller)546     public void setActivityController(IActivityController controller) {
547         synchronized (mLock) {
548             mController = controller;
549         }
550     }
551 
setAllowRestart(boolean allowRestart)552     public void setAllowRestart(boolean allowRestart) {
553         synchronized (mLock) {
554             mAllowRestart = allowRestart;
555         }
556     }
557 
addMonitor(Monitor monitor)558     public void addMonitor(Monitor monitor) {
559         synchronized (mLock) {
560             mMonitorChecker.addMonitorLocked(monitor);
561         }
562     }
563 
addThread(Handler thread)564     public void addThread(Handler thread) {
565         synchronized (mLock) {
566             final String name = thread.getLooper().getThread().getName();
567             mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name)));
568         }
569     }
570 
addThread(Handler thread, long timeoutMillis)571     public void addThread(Handler thread, long timeoutMillis) {
572         synchronized (mLock) {
573             final String name = thread.getLooper().getThread().getName();
574             mHandlerCheckers.add(
575                     withCustomTimeout(new HandlerChecker(thread, name), timeoutMillis));
576         }
577     }
578 
579     /**
580      * Pauses Watchdog action for the currently running thread. Useful before executing long running
581      * operations that could falsely trigger the watchdog. Each call to this will require a matching
582      * call to {@link #resumeWatchingCurrentThread}.
583      *
584      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
585      *
586      * <p>If the Watchdog is already paused for the current thread, this call adds
587      * adds another pause and will require an additional {@link #resumeCurrentThread} to resume.
588      *
589      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
590      * pauses have been resumed.
591      */
pauseWatchingCurrentThread(String reason)592     public void pauseWatchingCurrentThread(String reason) {
593         synchronized (mLock) {
594             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
595                 HandlerChecker checker = hc.checker();
596                 if (Thread.currentThread().equals(checker.getThread())) {
597                     checker.pauseLocked(reason);
598                 }
599             }
600         }
601     }
602 
603     /**
604      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
605      * thread.
606      *
607      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
608      *
609      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
610      *
611      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
612      * the Watchdog action for the current thread remains paused until resume is called at least
613      * as many times as the calls to pause.
614      */
resumeWatchingCurrentThread(String reason)615     public void resumeWatchingCurrentThread(String reason) {
616         synchronized (mLock) {
617             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
618                 HandlerChecker checker = hc.checker();
619                 if (Thread.currentThread().equals(checker.getThread())) {
620                     checker.resumeLocked(reason);
621                 }
622             }
623         }
624     }
625 
626     /**
627      * Perform a full reboot of the system.
628      */
rebootSystem(String reason)629     void rebootSystem(String reason) {
630         Slog.i(TAG, "Rebooting system because: " + reason);
631         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
632         try {
633             pms.reboot(false, reason, false);
634         } catch (RemoteException ex) {
635         }
636     }
637 
evaluateCheckerCompletionLocked()638     private int evaluateCheckerCompletionLocked() {
639         int state = COMPLETED;
640         for (int i=0; i<mHandlerCheckers.size(); i++) {
641             HandlerChecker hc = mHandlerCheckers.get(i).checker();
642             state = Math.max(state, hc.getCompletionStateLocked());
643         }
644         return state;
645     }
646 
getCheckersWithStateLocked(int completionState)647     private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) {
648         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
649         for (int i=0; i<mHandlerCheckers.size(); i++) {
650             HandlerChecker hc = mHandlerCheckers.get(i).checker();
651             if (hc.getCompletionStateLocked() == completionState) {
652                 checkers.add(hc);
653             }
654         }
655         return checkers;
656     }
657 
describeCheckersLocked(List<HandlerChecker> checkers)658     private String describeCheckersLocked(List<HandlerChecker> checkers) {
659         StringBuilder builder = new StringBuilder(128);
660         for (int i=0; i<checkers.size(); i++) {
661             if (builder.length() > 0) {
662                 builder.append(", ");
663             }
664             builder.append(checkers.get(i).describeBlockedStateLocked());
665         }
666         return builder.toString();
667     }
668 
addInterestingHidlPids(HashSet<Integer> pids)669     private static void addInterestingHidlPids(HashSet<Integer> pids) {
670         try {
671             IServiceManager serviceManager = IServiceManager.getService();
672             ArrayList<IServiceManager.InstanceDebugInfo> dump =
673                     serviceManager.debugDump();
674             for (IServiceManager.InstanceDebugInfo info : dump) {
675                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
676                     continue;
677                 }
678 
679                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
680                     continue;
681                 }
682 
683                 pids.add(info.pid);
684             }
685         } catch (RemoteException e) {
686             Log.w(TAG, e);
687         }
688     }
689 
addInterestingAidlPids(HashSet<Integer> pids)690     private static void addInterestingAidlPids(HashSet<Integer> pids) {
691         ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
692         if (infos == null) return;
693 
694         for (ServiceDebugInfo info : infos) {
695             for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
696                 if (info.name.startsWith(prefix)) {
697                     pids.add(info.debugPid);
698                 }
699             }
700         }
701     }
702 
getInterestingNativePids()703     static ArrayList<Integer> getInterestingNativePids() {
704         HashSet<Integer> pids = new HashSet<>();
705         addInterestingAidlPids(pids);
706         addInterestingHidlPids(pids);
707 
708         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
709         if (nativePids != null) {
710             for (int i : nativePids) {
711                 pids.add(i);
712             }
713         }
714 
715         return new ArrayList<Integer>(pids);
716     }
717 
run()718     private void run() {
719         boolean waitedHalf = false;
720 
721         while (true) {
722             List<HandlerChecker> blockedCheckers = Collections.emptyList();
723             String subject = "";
724             boolean allowRestart = true;
725             int debuggerWasConnected = 0;
726             boolean doWaitedHalfDump = false;
727             // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
728             // We store the current value to use a consistent value for all handlers.
729             final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
730             final long checkIntervalMillis = watchdogTimeoutMillis / 2;
731             final ArrayList<Integer> pids;
732             synchronized (mLock) {
733                 long timeout = checkIntervalMillis;
734                 // Make sure we (re)spin the checkers that have become idle within
735                 // this wait-and-check interval
736                 for (int i=0; i<mHandlerCheckers.size(); i++) {
737                     HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
738                     // We pick the watchdog to apply every time we reschedule the checkers. The
739                     // default timeout might have changed since the last run.
740                     hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
741                             .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
742                 }
743 
744                 if (debuggerWasConnected > 0) {
745                     debuggerWasConnected--;
746                 }
747 
748                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
749                 // wait while asleep. If the device is asleep then the thing that we are waiting
750                 // to timeout on is asleep as well and won't have a chance to run, causing a false
751                 // positive on when to kill things.
752                 long start = SystemClock.uptimeMillis();
753                 while (timeout > 0) {
754                     if (Debug.isDebuggerConnected()) {
755                         debuggerWasConnected = 2;
756                     }
757                     try {
758                         mLock.wait(timeout);
759                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
760                     } catch (InterruptedException e) {
761                         Log.wtf(TAG, e);
762                     }
763                     if (Debug.isDebuggerConnected()) {
764                         debuggerWasConnected = 2;
765                     }
766                     timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
767                 }
768 
769                 final int waitState = evaluateCheckerCompletionLocked();
770                 if (waitState == COMPLETED) {
771                     // The monitors have returned; reset
772                     waitedHalf = false;
773                     continue;
774                 } else if (waitState == WAITING) {
775                     // still waiting but within their configured intervals; back off and recheck
776                     continue;
777                 } else if (waitState == WAITED_HALF) {
778                     if (!waitedHalf) {
779                         Slog.i(TAG, "WAITED_HALF");
780                         waitedHalf = true;
781                         // We've waited half, but we'd need to do the stack trace dump w/o the lock.
782                         blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
783                         subject = describeCheckersLocked(blockedCheckers);
784                         pids = new ArrayList<>(mInterestingJavaPids);
785                         doWaitedHalfDump = true;
786                     } else {
787                         continue;
788                     }
789                 } else {
790                     // something is overdue!
791                     blockedCheckers = getCheckersWithStateLocked(OVERDUE);
792                     subject = describeCheckersLocked(blockedCheckers);
793                     allowRestart = mAllowRestart;
794                     pids = new ArrayList<>(mInterestingJavaPids);
795                 }
796             } // END synchronized (mLock)
797 
798             // If we got here, that means that the system is most likely hung.
799             //
800             // First collect stack traces from all threads of the system process.
801             //
802             // Then, if we reached the full timeout, kill this process so that the system will
803             // restart. If we reached half of the timeout, just log some information and continue.
804             logWatchog(doWaitedHalfDump, subject, pids);
805 
806             if (doWaitedHalfDump) {
807                 // We have waited for only half of the timeout, we continue to wait for the duration
808                 // of the full timeout before killing the process.
809                 continue;
810             }
811 
812             IActivityController controller;
813             synchronized (mLock) {
814                 controller = mController;
815             }
816             if (controller != null) {
817                 Slog.i(TAG, "Reporting stuck state to activity controller");
818                 try {
819                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
820                     // 1 = keep waiting, -1 = kill system
821                     int res = controller.systemNotResponding(subject);
822                     if (res >= 0) {
823                         Slog.i(TAG, "Activity controller requested to coninue to wait");
824                         waitedHalf = false;
825                         continue;
826                     }
827                 } catch (RemoteException e) {
828                 }
829             }
830 
831             // Only kill the process if the debugger is not attached.
832             if (Debug.isDebuggerConnected()) {
833                 debuggerWasConnected = 2;
834             }
835             if (debuggerWasConnected >= 2) {
836                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
837             } else if (debuggerWasConnected > 0) {
838                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
839             } else if (!allowRestart) {
840                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
841             } else {
842                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
843                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
844                 Slog.w(TAG, "*** GOODBYE!");
845                 if (!Build.IS_USER && isCrashLoopFound()
846                         && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
847                     breakCrashLoop();
848                 }
849                 Process.killProcess(Process.myPid());
850                 System.exit(10);
851             }
852 
853             waitedHalf = false;
854         }
855     }
856 
logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids)857     private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) {
858         // Get critical event log before logging the half watchdog so that it doesn't
859         // occur in the log.
860         String criticalEvents =
861                 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile();
862         final UUID errorId = mTraceErrorLogger.generateErrorId();
863         if (mTraceErrorLogger.isAddErrorIdEnabled()) {
864             mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
865             mTraceErrorLogger.addSubjectToTrace(subject, errorId);
866         }
867 
868         final String dropboxTag;
869         if (halfWatchdog) {
870             dropboxTag = "pre_watchdog";
871             CriticalEventLog.getInstance().logHalfWatchdog(subject);
872         } else {
873             dropboxTag = "watchdog";
874             CriticalEventLog.getInstance().logWatchdog(subject, errorId);
875             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
876             // Log the atom as early as possible since it is used as a mechanism to trigger
877             // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
878             // point in time when the Watchdog happens as possible.
879             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
880         }
881 
882         long anrTime = SystemClock.uptimeMillis();
883         StringBuilder report = new StringBuilder();
884         report.append(MemoryPressureUtil.currentPsiState());
885         ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
886         StringWriter tracesFileException = new StringWriter();
887         final File stack = ActivityManagerService.dumpStackTraces(
888                 pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
889                 tracesFileException, subject, criticalEvents);
890         // Give some extra time to make sure the stack traces get written.
891         // The system's been hanging for a whlie, another second or two won't hurt much.
892         SystemClock.sleep(5000);
893         processCpuTracker.update();
894         report.append(processCpuTracker.printCurrentState(anrTime));
895         report.append(tracesFileException.getBuffer());
896 
897         if (!halfWatchdog) {
898             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the
899             // kernel log
900             doSysRq('w');
901             doSysRq('l');
902         }
903 
904         // Try to add the error to the dropbox, but assuming that the ActivityManager
905         // itself may be deadlocked.  (which has happened, causing this statement to
906         // deadlock and the watchdog as a whole to be ineffective)
907         Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
908                 public void run() {
909                     // If a watched thread hangs before init() is called, we don't have a
910                     // valid mActivity. So we can't log the error to dropbox.
911                     if (mActivity != null) {
912                         mActivity.addErrorToDropBox(
913                                 dropboxTag, null, "system_server", null, null, null,
914                                 null, report.toString(), stack, null, null, null,
915                                 errorId);
916                     }
917                 }
918             };
919         dropboxThread.start();
920         try {
921             dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
922         } catch (InterruptedException ignored) { }
923     }
924 
doSysRq(char c)925     private void doSysRq(char c) {
926         try {
927             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
928             sysrq_trigger.write(c);
929             sysrq_trigger.close();
930         } catch (IOException e) {
931             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
932         }
933     }
934 
resetTimeoutHistory()935     private void resetTimeoutHistory() {
936         writeTimeoutHistory(new ArrayList<String>());
937     }
938 
writeTimeoutHistory(Iterable<String> crashHistory)939     private void writeTimeoutHistory(Iterable<String> crashHistory) {
940         String data = String.join(",", crashHistory);
941 
942         try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
943             writer.write(SystemProperties.get("ro.boottime.zygote"));
944             writer.write(":");
945             writer.write(data);
946         } catch (IOException e) {
947             Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
948         }
949     }
950 
readTimeoutHistory()951     private String[] readTimeoutHistory() {
952         final String[] emptyStringArray = {};
953 
954         try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
955             String line = reader.readLine();
956             if (line == null) {
957                 return emptyStringArray;
958             }
959 
960             String[] data = line.trim().split(":");
961             String boottime = data.length >= 1 ? data[0] : "";
962             String history = data.length >= 2 ? data[1] : "";
963             if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
964                 return history.split(",");
965             } else {
966                 return emptyStringArray;
967             }
968         } catch (FileNotFoundException e) {
969             return emptyStringArray;
970         } catch (IOException e) {
971             Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
972             return emptyStringArray;
973         }
974     }
975 
hasActiveUsbConnection()976     private boolean hasActiveUsbConnection() {
977         try {
978             final String state = FileUtils.readTextFile(
979                     new File("/sys/class/android_usb/android0/state"),
980                     128 /*max*/, null /*ellipsis*/).trim();
981             if ("CONFIGURED".equals(state)) {
982                 return true;
983             }
984         } catch (IOException e) {
985             Slog.w(TAG, "Failed to determine if device was on USB", e);
986         }
987         return false;
988     }
989 
isCrashLoopFound()990     private boolean isCrashLoopFound() {
991         int fatalCount = WatchdogProperties.fatal_count().orElse(0);
992         long fatalWindowMs = TimeUnit.SECONDS.toMillis(
993                 WatchdogProperties.fatal_window_seconds().orElse(0));
994         if (fatalCount == 0 || fatalWindowMs == 0) {
995             if (fatalCount != fatalWindowMs) {
996                 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
997                             PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
998             }
999             return false;
1000         }
1001 
1002         // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
1003         long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
1004         String[] rawCrashHistory = readTimeoutHistory();
1005         ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
1006                         rawCrashHistory,
1007                         Math.max(0, rawCrashHistory.length - fatalCount - 1),
1008                         rawCrashHistory.length)));
1009         // Something wrong here.
1010         crashHistory.add(String.valueOf(nowMs));
1011         writeTimeoutHistory(crashHistory);
1012 
1013         // Returns false if the device has an active USB connection.
1014         if (hasActiveUsbConnection()) {
1015             return false;
1016         }
1017 
1018         long firstCrashMs;
1019         try {
1020             firstCrashMs = Long.parseLong(crashHistory.get(0));
1021         } catch (NumberFormatException t) {
1022             Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
1023             resetTimeoutHistory();
1024             return false;
1025         }
1026         return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
1027     }
1028 
breakCrashLoop()1029     private void breakCrashLoop() {
1030         try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
1031             kmsg.append("Fatal reset to escape the system_server crashing loop\n");
1032         } catch (IOException e) {
1033             Slog.w(TAG, "Failed to append to kmsg", e);
1034         }
1035         doSysRq('c');
1036     }
1037 
1038     @Override
dump(@onNull PrintWriter pw, @Nullable String[] args)1039     public void dump(@NonNull PrintWriter pw, @Nullable String[] args) {
1040         pw.print("WatchdogTimeoutMillis=");
1041         pw.println(mWatchdogTimeoutMillis);
1042     }
1043 }
1044