• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.content.BroadcastReceiver;
21 import android.content.Context;
22 import android.content.Intent;
23 import android.content.IntentFilter;
24 import android.hidl.manager.V1_0.IServiceManager;
25 import android.os.Binder;
26 import android.os.Build;
27 import android.os.Debug;
28 import android.os.FileUtils;
29 import android.os.Handler;
30 import android.os.IPowerManager;
31 import android.os.Looper;
32 import android.os.Process;
33 import android.os.RemoteException;
34 import android.os.ServiceDebugInfo;
35 import android.os.ServiceManager;
36 import android.os.SystemClock;
37 import android.os.SystemProperties;
38 import android.sysprop.WatchdogProperties;
39 import android.util.EventLog;
40 import android.util.Log;
41 import android.util.Slog;
42 import android.util.SparseArray;
43 
44 import com.android.internal.os.ProcessCpuTracker;
45 import com.android.internal.os.ZygoteConnectionConstants;
46 import com.android.internal.util.FrameworkStatsLog;
47 import com.android.server.am.ActivityManagerService;
48 import com.android.server.am.TraceErrorLogger;
49 import com.android.server.wm.SurfaceAnimationThread;
50 
51 import java.io.BufferedReader;
52 import java.io.File;
53 import java.io.FileNotFoundException;
54 import java.io.FileReader;
55 import java.io.FileWriter;
56 import java.io.IOException;
57 import java.io.StringWriter;
58 import java.util.ArrayList;
59 import java.util.Arrays;
60 import java.util.Collections;
61 import java.util.HashSet;
62 import java.util.List;
63 import java.util.UUID;
64 import java.util.concurrent.TimeUnit;
65 
66 /** This class calls its monitors every minute, killing this process if they don't return. */
67 public class Watchdog {
68     static final String TAG = "Watchdog";
69 
70     /** Debug flag. */
71     public static final boolean DEBUG = false;
72 
73     // Set this to true to use debug default values.
74     private static final boolean DB = false;
75 
76     // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
77     //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
78     //         can trigger the watchdog.
79     // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
80     //         applications may not work with a debug build. CTS will fail.
81     private static final long DEFAULT_TIMEOUT =
82             (DB ? 10 * 1000 : 60 * 1000) * Build.HW_TIMEOUT_MULTIPLIER;
83     private static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
84 
85     // These are temporally ordered: larger values as lateness increases
86     private static final int COMPLETED = 0;
87     private static final int WAITING = 1;
88     private static final int WAITED_HALF = 2;
89     private static final int OVERDUE = 3;
90 
91     // Track watchdog timeout history and break the crash loop if there is.
92     private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
93     private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
94     private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
95             "framework_watchdog.fatal_window.second";
96 
97     // Which native processes to dump into dropbox's stack traces
98     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
99         "/system/bin/audioserver",
100         "/system/bin/cameraserver",
101         "/system/bin/drmserver",
102         "/system/bin/keystore2",
103         "/system/bin/mediadrmserver",
104         "/system/bin/mediaserver",
105         "/system/bin/netd",
106         "/system/bin/sdcard",
107         "/system/bin/surfaceflinger",
108         "/system/bin/vold",
109         "media.extractor", // system/bin/mediaextractor
110         "media.metrics", // system/bin/mediametrics
111         "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
112         "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
113         "media.transcoding", // Media transcoding service
114         "com.android.bluetooth",  // Bluetooth service
115         "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
116     };
117 
118     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
119             "android.hardware.audio@4.0::IDevicesFactory",
120             "android.hardware.audio@5.0::IDevicesFactory",
121             "android.hardware.audio@6.0::IDevicesFactory",
122             "android.hardware.audio@7.0::IDevicesFactory",
123             "android.hardware.biometrics.face@1.0::IBiometricsFace",
124             "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
125             "android.hardware.bluetooth@1.0::IBluetoothHci",
126             "android.hardware.camera.provider@2.4::ICameraProvider",
127             "android.hardware.gnss@1.0::IGnss",
128             "android.hardware.graphics.allocator@2.0::IAllocator",
129             "android.hardware.graphics.composer@2.1::IComposer",
130             "android.hardware.health@2.0::IHealth",
131             "android.hardware.light@2.0::ILight",
132             "android.hardware.media.c2@1.0::IComponentStore",
133             "android.hardware.media.omx@1.0::IOmx",
134             "android.hardware.media.omx@1.0::IOmxStore",
135             "android.hardware.neuralnetworks@1.0::IDevice",
136             "android.hardware.power.stats@1.0::IPowerStats",
137             "android.hardware.sensors@1.0::ISensors",
138             "android.hardware.sensors@2.0::ISensors",
139             "android.hardware.sensors@2.1::ISensors",
140             "android.hardware.vr@1.0::IVr",
141             "android.system.suspend@1.0::ISystemSuspend"
142     );
143 
144     public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
145             "android.hardware.light.ILights/",
146             "android.hardware.power.stats.IPowerStats/",
147     };
148 
149     private static Watchdog sWatchdog;
150 
151     private final Thread mThread;
152 
153     private final Object mLock = new Object();
154 
155     /* This handler will be used to post message back onto the main thread */
156     private final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
157     private final HandlerChecker mMonitorChecker;
158     private ActivityManagerService mActivity;
159 
160     private IActivityController mController;
161     private boolean mAllowRestart = true;
162     private final List<Integer> mInterestingJavaPids = new ArrayList<>();
163 
164     private final TraceErrorLogger mTraceErrorLogger;
165 
166     /**
167      * Used for checking status of handle threads and scheduling monitor callbacks.
168      */
169     public final class HandlerChecker implements Runnable {
170         private final Handler mHandler;
171         private final String mName;
172         private final long mWaitMax;
173         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
174         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
175         private boolean mCompleted;
176         private Monitor mCurrentMonitor;
177         private long mStartTime;
178         private int mPauseCount;
179 
HandlerChecker(Handler handler, String name, long waitMaxMillis)180         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
181             mHandler = handler;
182             mName = name;
183             mWaitMax = waitMaxMillis;
184             mCompleted = true;
185         }
186 
addMonitorLocked(Monitor monitor)187         void addMonitorLocked(Monitor monitor) {
188             // We don't want to update mMonitors when the Handler is in the middle of checking
189             // all monitors. We will update mMonitors on the next schedule if it is safe
190             mMonitorQueue.add(monitor);
191         }
192 
scheduleCheckLocked()193         public void scheduleCheckLocked() {
194             if (mCompleted) {
195                 // Safe to update monitors in queue, Handler is not in the middle of work
196                 mMonitors.addAll(mMonitorQueue);
197                 mMonitorQueue.clear();
198             }
199             if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
200                     || (mPauseCount > 0)) {
201                 // Don't schedule until after resume OR
202                 // If the target looper has recently been polling, then
203                 // there is no reason to enqueue our checker on it since that
204                 // is as good as it not being deadlocked.  This avoid having
205                 // to do a context switch to check the thread. Note that we
206                 // only do this if we have no monitors since those would need to
207                 // be executed at this point.
208                 mCompleted = true;
209                 return;
210             }
211             if (!mCompleted) {
212                 // we already have a check in flight, so no need
213                 return;
214             }
215 
216             mCompleted = false;
217             mCurrentMonitor = null;
218             mStartTime = SystemClock.uptimeMillis();
219             mHandler.postAtFrontOfQueue(this);
220         }
221 
isOverdueLocked()222         boolean isOverdueLocked() {
223             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
224         }
225 
getCompletionStateLocked()226         public int getCompletionStateLocked() {
227             if (mCompleted) {
228                 return COMPLETED;
229             } else {
230                 long latency = SystemClock.uptimeMillis() - mStartTime;
231                 if (latency < mWaitMax/2) {
232                     return WAITING;
233                 } else if (latency < mWaitMax) {
234                     return WAITED_HALF;
235                 }
236             }
237             return OVERDUE;
238         }
239 
getThread()240         public Thread getThread() {
241             return mHandler.getLooper().getThread();
242         }
243 
getName()244         public String getName() {
245             return mName;
246         }
247 
describeBlockedStateLocked()248         String describeBlockedStateLocked() {
249             if (mCurrentMonitor == null) {
250                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
251             } else {
252                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
253                         + " on " + mName + " (" + getThread().getName() + ")";
254             }
255         }
256 
257         @Override
run()258         public void run() {
259             // Once we get here, we ensure that mMonitors does not change even if we call
260             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
261             // move them to mMonitors on the next schedule when mCompleted is true, at which
262             // point we have completed execution of this method.
263             final int size = mMonitors.size();
264             for (int i = 0 ; i < size ; i++) {
265                 synchronized (mLock) {
266                     mCurrentMonitor = mMonitors.get(i);
267                 }
268                 mCurrentMonitor.monitor();
269             }
270 
271             synchronized (mLock) {
272                 mCompleted = true;
273                 mCurrentMonitor = null;
274             }
275         }
276 
277         /** Pause the HandlerChecker. */
pauseLocked(String reason)278         public void pauseLocked(String reason) {
279             mPauseCount++;
280             // Mark as completed, because there's a chance we called this after the watchog
281             // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
282             // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
283             mCompleted = true;
284             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
285                     + reason + ". Pause count: " + mPauseCount);
286         }
287 
288         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)289         public void resumeLocked(String reason) {
290             if (mPauseCount > 0) {
291                 mPauseCount--;
292                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
293                         + reason + ". Pause count: " + mPauseCount);
294             } else {
295                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
296             }
297         }
298     }
299 
300     final class RebootRequestReceiver extends BroadcastReceiver {
301         @Override
onReceive(Context c, Intent intent)302         public void onReceive(Context c, Intent intent) {
303             if (intent.getIntExtra("nowait", 0) != 0) {
304                 rebootSystem("Received ACTION_REBOOT broadcast");
305                 return;
306             }
307             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
308         }
309     }
310 
311     /** Monitor for checking the availability of binder threads. The monitor will block until
312      * there is a binder thread available to process in coming IPCs to make sure other processes
313      * can still communicate with the service.
314      */
315     private static final class BinderThreadMonitor implements Watchdog.Monitor {
316         @Override
monitor()317         public void monitor() {
318             Binder.blockUntilThreadAvailable();
319         }
320     }
321 
322     public interface Monitor {
monitor()323         void monitor();
324     }
325 
getInstance()326     public static Watchdog getInstance() {
327         if (sWatchdog == null) {
328             sWatchdog = new Watchdog();
329         }
330 
331         return sWatchdog;
332     }
333 
Watchdog()334     private Watchdog() {
335         mThread = new Thread(this::run, "watchdog");
336         // Initialize handler checkers for each common thread we want to check.  Note
337         // that we are not currently checking the background thread, since it can
338         // potentially hold longer running operations with no guarantees about the timeliness
339         // of operations there.
340 
341         // The shared foreground thread is the main checker.  It is where we
342         // will also dispatch monitor checks and do other work.
343         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
344                 "foreground thread", DEFAULT_TIMEOUT);
345         mHandlerCheckers.add(mMonitorChecker);
346         // Add checker for main thread.  We only do a quick check since there
347         // can be UI running on the thread.
348         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
349                 "main thread", DEFAULT_TIMEOUT));
350         // Add checker for shared UI thread.
351         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
352                 "ui thread", DEFAULT_TIMEOUT));
353         // And also check IO thread.
354         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
355                 "i/o thread", DEFAULT_TIMEOUT));
356         // And the display thread.
357         mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
358                 "display thread", DEFAULT_TIMEOUT));
359         // And the animation thread.
360         mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
361                 "animation thread", DEFAULT_TIMEOUT));
362         // And the surface animation thread.
363         mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
364                 "surface animation thread", DEFAULT_TIMEOUT));
365 
366         // Initialize monitor for Binder threads.
367         addMonitor(new BinderThreadMonitor());
368 
369         mInterestingJavaPids.add(Process.myPid());
370 
371         // See the notes on DEFAULT_TIMEOUT.
372         assert DB ||
373                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
374 
375         mTraceErrorLogger = new TraceErrorLogger();
376     }
377 
378     /**
379      * Called by SystemServer to cause the internal thread to begin execution.
380      */
start()381     public void start() {
382         mThread.start();
383     }
384 
385     /**
386      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
387      * Should be called during boot after the ActivityManagerService is up and registered
388      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
389      */
init(Context context, ActivityManagerService activity)390     public void init(Context context, ActivityManagerService activity) {
391         mActivity = activity;
392         context.registerReceiver(new RebootRequestReceiver(),
393                 new IntentFilter(Intent.ACTION_REBOOT),
394                 android.Manifest.permission.REBOOT, null);
395     }
396 
isInterestingJavaProcess(String processName)397     private static boolean isInterestingJavaProcess(String processName) {
398         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
399                 || processName.equals("com.android.phone");
400     }
401 
402     /**
403      * Notifies the watchdog when a Java process with {@code pid} is started.
404      * This process may have its stack trace dumped during an ANR.
405      */
processStarted(String processName, int pid)406     public void processStarted(String processName, int pid) {
407         if (isInterestingJavaProcess(processName)) {
408             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
409             synchronized (mLock) {
410                 mInterestingJavaPids.add(pid);
411             }
412         }
413     }
414 
415     /**
416      * Notifies the watchdog when a Java process with {@code pid} dies.
417      */
processDied(String processName, int pid)418     public void processDied(String processName, int pid) {
419         if (isInterestingJavaProcess(processName)) {
420             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
421             synchronized (mLock) {
422                 mInterestingJavaPids.remove(Integer.valueOf(pid));
423             }
424         }
425     }
426 
setActivityController(IActivityController controller)427     public void setActivityController(IActivityController controller) {
428         synchronized (mLock) {
429             mController = controller;
430         }
431     }
432 
setAllowRestart(boolean allowRestart)433     public void setAllowRestart(boolean allowRestart) {
434         synchronized (mLock) {
435             mAllowRestart = allowRestart;
436         }
437     }
438 
addMonitor(Monitor monitor)439     public void addMonitor(Monitor monitor) {
440         synchronized (mLock) {
441             mMonitorChecker.addMonitorLocked(monitor);
442         }
443     }
444 
addThread(Handler thread)445     public void addThread(Handler thread) {
446         addThread(thread, DEFAULT_TIMEOUT);
447     }
448 
addThread(Handler thread, long timeoutMillis)449     public void addThread(Handler thread, long timeoutMillis) {
450         synchronized (mLock) {
451             final String name = thread.getLooper().getThread().getName();
452             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
453         }
454     }
455 
456     /**
457      * Pauses Watchdog action for the currently running thread. Useful before executing long running
458      * operations that could falsely trigger the watchdog. Each call to this will require a matching
459      * call to {@link #resumeWatchingCurrentThread}.
460      *
461      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
462      *
463      * <p>If the Watchdog is already paused for the current thread, this call adds
464      * adds another pause and will require an additional {@link #resumeCurrentThread} to resume.
465      *
466      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
467      * pauses have been resumed.
468      */
pauseWatchingCurrentThread(String reason)469     public void pauseWatchingCurrentThread(String reason) {
470         synchronized (mLock) {
471             for (HandlerChecker hc : mHandlerCheckers) {
472                 if (Thread.currentThread().equals(hc.getThread())) {
473                     hc.pauseLocked(reason);
474                 }
475             }
476         }
477     }
478 
479     /**
480      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
481      * thread.
482      *
483      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
484      *
485      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
486      *
487      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
488      * the Watchdog action for the current thread remains paused until resume is called at least
489      * as many times as the calls to pause.
490      */
resumeWatchingCurrentThread(String reason)491     public void resumeWatchingCurrentThread(String reason) {
492         synchronized (mLock) {
493             for (HandlerChecker hc : mHandlerCheckers) {
494                 if (Thread.currentThread().equals(hc.getThread())) {
495                     hc.resumeLocked(reason);
496                 }
497             }
498         }
499     }
500 
501     /**
502      * Perform a full reboot of the system.
503      */
rebootSystem(String reason)504     void rebootSystem(String reason) {
505         Slog.i(TAG, "Rebooting system because: " + reason);
506         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
507         try {
508             pms.reboot(false, reason, false);
509         } catch (RemoteException ex) {
510         }
511     }
512 
evaluateCheckerCompletionLocked()513     private int evaluateCheckerCompletionLocked() {
514         int state = COMPLETED;
515         for (int i=0; i<mHandlerCheckers.size(); i++) {
516             HandlerChecker hc = mHandlerCheckers.get(i);
517             state = Math.max(state, hc.getCompletionStateLocked());
518         }
519         return state;
520     }
521 
getBlockedCheckersLocked()522     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
523         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
524         for (int i=0; i<mHandlerCheckers.size(); i++) {
525             HandlerChecker hc = mHandlerCheckers.get(i);
526             if (hc.isOverdueLocked()) {
527                 checkers.add(hc);
528             }
529         }
530         return checkers;
531     }
532 
describeCheckersLocked(List<HandlerChecker> checkers)533     private String describeCheckersLocked(List<HandlerChecker> checkers) {
534         StringBuilder builder = new StringBuilder(128);
535         for (int i=0; i<checkers.size(); i++) {
536             if (builder.length() > 0) {
537                 builder.append(", ");
538             }
539             builder.append(checkers.get(i).describeBlockedStateLocked());
540         }
541         return builder.toString();
542     }
543 
addInterestingHidlPids(HashSet<Integer> pids)544     private static void addInterestingHidlPids(HashSet<Integer> pids) {
545         try {
546             IServiceManager serviceManager = IServiceManager.getService();
547             ArrayList<IServiceManager.InstanceDebugInfo> dump =
548                     serviceManager.debugDump();
549             for (IServiceManager.InstanceDebugInfo info : dump) {
550                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
551                     continue;
552                 }
553 
554                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
555                     continue;
556                 }
557 
558                 pids.add(info.pid);
559             }
560         } catch (RemoteException e) {
561             Log.w(TAG, e);
562         }
563     }
564 
addInterestingAidlPids(HashSet<Integer> pids)565     private static void addInterestingAidlPids(HashSet<Integer> pids) {
566         ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
567         if (infos == null) return;
568 
569         for (ServiceDebugInfo info : infos) {
570             for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
571                 if (info.name.startsWith(prefix)) {
572                     pids.add(info.debugPid);
573                 }
574             }
575         }
576     }
577 
getInterestingNativePids()578     static ArrayList<Integer> getInterestingNativePids() {
579         HashSet<Integer> pids = new HashSet<>();
580         addInterestingAidlPids(pids);
581         addInterestingHidlPids(pids);
582 
583         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
584         if (nativePids != null) {
585             for (int i : nativePids) {
586                 pids.add(i);
587             }
588         }
589 
590         return new ArrayList<Integer>(pids);
591     }
592 
run()593     private void run() {
594         boolean waitedHalf = false;
595         while (true) {
596             List<HandlerChecker> blockedCheckers = Collections.emptyList();
597             String subject = "";
598             boolean allowRestart = true;
599             int debuggerWasConnected = 0;
600             boolean doWaitedHalfDump = false;
601             final ArrayList<Integer> pids;
602             synchronized (mLock) {
603                 long timeout = CHECK_INTERVAL;
604                 // Make sure we (re)spin the checkers that have become idle within
605                 // this wait-and-check interval
606                 for (int i=0; i<mHandlerCheckers.size(); i++) {
607                     HandlerChecker hc = mHandlerCheckers.get(i);
608                     hc.scheduleCheckLocked();
609                 }
610 
611                 if (debuggerWasConnected > 0) {
612                     debuggerWasConnected--;
613                 }
614 
615                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
616                 // wait while asleep. If the device is asleep then the thing that we are waiting
617                 // to timeout on is asleep as well and won't have a chance to run, causing a false
618                 // positive on when to kill things.
619                 long start = SystemClock.uptimeMillis();
620                 while (timeout > 0) {
621                     if (Debug.isDebuggerConnected()) {
622                         debuggerWasConnected = 2;
623                     }
624                     try {
625                         mLock.wait(timeout);
626                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
627                     } catch (InterruptedException e) {
628                         Log.wtf(TAG, e);
629                     }
630                     if (Debug.isDebuggerConnected()) {
631                         debuggerWasConnected = 2;
632                     }
633                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
634                 }
635 
636                 final int waitState = evaluateCheckerCompletionLocked();
637                 if (waitState == COMPLETED) {
638                     // The monitors have returned; reset
639                     waitedHalf = false;
640                     continue;
641                 } else if (waitState == WAITING) {
642                     // still waiting but within their configured intervals; back off and recheck
643                     continue;
644                 } else if (waitState == WAITED_HALF) {
645                     if (!waitedHalf) {
646                         Slog.i(TAG, "WAITED_HALF");
647                         waitedHalf = true;
648                         // We've waited half, but we'd need to do the stack trace dump w/o the lock.
649                         pids = new ArrayList<>(mInterestingJavaPids);
650                         doWaitedHalfDump = true;
651                     } else {
652                         continue;
653                     }
654                 } else {
655                     // something is overdue!
656                     blockedCheckers = getBlockedCheckersLocked();
657                     subject = describeCheckersLocked(blockedCheckers);
658                     allowRestart = mAllowRestart;
659                     pids = new ArrayList<>(mInterestingJavaPids);
660                 }
661             } // END synchronized (mLock)
662 
663             if (doWaitedHalfDump) {
664                 // We've waited half the deadlock-detection interval.  Pull a stack
665                 // trace and wait another half.
666                 ActivityManagerService.dumpStackTraces(pids, null, null,
667                         getInterestingNativePids(), null, subject);
668                 continue;
669             }
670 
671             // If we got here, that means that the system is most likely hung.
672             // First collect stack traces from all threads of the system process.
673             // Then kill this process so that the system will restart.
674             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
675 
676             final UUID errorId;
677             if (mTraceErrorLogger.isAddErrorIdEnabled()) {
678                 errorId = mTraceErrorLogger.generateErrorId();
679                 mTraceErrorLogger.addErrorIdToTrace("system_server", errorId);
680             } else {
681                 errorId = null;
682             }
683 
684             // Log the atom as early as possible since it is used as a mechanism to trigger
685             // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
686             // point in time when the Watchdog happens as possible.
687             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
688 
689             long anrTime = SystemClock.uptimeMillis();
690             StringBuilder report = new StringBuilder();
691             report.append(MemoryPressureUtil.currentPsiState());
692             ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
693             StringWriter tracesFileException = new StringWriter();
694             final File stack = ActivityManagerService.dumpStackTraces(
695                     pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
696                     tracesFileException, subject);
697 
698             // Give some extra time to make sure the stack traces get written.
699             // The system's been hanging for a minute, another second or two won't hurt much.
700             SystemClock.sleep(5000);
701 
702             processCpuTracker.update();
703             report.append(processCpuTracker.printCurrentState(anrTime));
704             report.append(tracesFileException.getBuffer());
705 
706             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
707             doSysRq('w');
708             doSysRq('l');
709 
710             // Try to add the error to the dropbox, but assuming that the ActivityManager
711             // itself may be deadlocked.  (which has happened, causing this statement to
712             // deadlock and the watchdog as a whole to be ineffective)
713             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
714                     public void run() {
715                         // If a watched thread hangs before init() is called, we don't have a
716                         // valid mActivity. So we can't log the error to dropbox.
717                         if (mActivity != null) {
718                             mActivity.addErrorToDropBox(
719                                     "watchdog", null, "system_server", null, null, null,
720                                     null, report.toString(), stack, null, null, null,
721                                     errorId);
722                         }
723                     }
724                 };
725             dropboxThread.start();
726             try {
727                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
728             } catch (InterruptedException ignored) {}
729 
730             IActivityController controller;
731             synchronized (mLock) {
732                 controller = mController;
733             }
734             if (controller != null) {
735                 Slog.i(TAG, "Reporting stuck state to activity controller");
736                 try {
737                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
738                     // 1 = keep waiting, -1 = kill system
739                     int res = controller.systemNotResponding(subject);
740                     if (res >= 0) {
741                         Slog.i(TAG, "Activity controller requested to coninue to wait");
742                         waitedHalf = false;
743                         continue;
744                     }
745                 } catch (RemoteException e) {
746                 }
747             }
748 
749             // Only kill the process if the debugger is not attached.
750             if (Debug.isDebuggerConnected()) {
751                 debuggerWasConnected = 2;
752             }
753             if (debuggerWasConnected >= 2) {
754                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
755             } else if (debuggerWasConnected > 0) {
756                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
757             } else if (!allowRestart) {
758                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
759             } else {
760                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
761                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
762                 Slog.w(TAG, "*** GOODBYE!");
763                 if (!Build.IS_USER && isCrashLoopFound()
764                         && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
765                     breakCrashLoop();
766                 }
767                 Process.killProcess(Process.myPid());
768                 System.exit(10);
769             }
770 
771             waitedHalf = false;
772         }
773     }
774 
doSysRq(char c)775     private void doSysRq(char c) {
776         try {
777             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
778             sysrq_trigger.write(c);
779             sysrq_trigger.close();
780         } catch (IOException e) {
781             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
782         }
783     }
784 
resetTimeoutHistory()785     private void resetTimeoutHistory() {
786         writeTimeoutHistory(new ArrayList<String>());
787     }
788 
writeTimeoutHistory(Iterable<String> crashHistory)789     private void writeTimeoutHistory(Iterable<String> crashHistory) {
790         String data = String.join(",", crashHistory);
791 
792         try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
793             writer.write(SystemProperties.get("ro.boottime.zygote"));
794             writer.write(":");
795             writer.write(data);
796         } catch (IOException e) {
797             Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
798         }
799     }
800 
readTimeoutHistory()801     private String[] readTimeoutHistory() {
802         final String[] emptyStringArray = {};
803 
804         try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
805             String line = reader.readLine();
806             if (line == null) {
807                 return emptyStringArray;
808             }
809 
810             String[] data = line.trim().split(":");
811             String boottime = data.length >= 1 ? data[0] : "";
812             String history = data.length >= 2 ? data[1] : "";
813             if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
814                 return history.split(",");
815             } else {
816                 return emptyStringArray;
817             }
818         } catch (FileNotFoundException e) {
819             return emptyStringArray;
820         } catch (IOException e) {
821             Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
822             return emptyStringArray;
823         }
824     }
825 
hasActiveUsbConnection()826     private boolean hasActiveUsbConnection() {
827         try {
828             final String state = FileUtils.readTextFile(
829                     new File("/sys/class/android_usb/android0/state"),
830                     128 /*max*/, null /*ellipsis*/).trim();
831             if ("CONFIGURED".equals(state)) {
832                 return true;
833             }
834         } catch (IOException e) {
835             Slog.w(TAG, "Failed to determine if device was on USB", e);
836         }
837         return false;
838     }
839 
isCrashLoopFound()840     private boolean isCrashLoopFound() {
841         int fatalCount = WatchdogProperties.fatal_count().orElse(0);
842         long fatalWindowMs = TimeUnit.SECONDS.toMillis(
843                 WatchdogProperties.fatal_window_seconds().orElse(0));
844         if (fatalCount == 0 || fatalWindowMs == 0) {
845             if (fatalCount != fatalWindowMs) {
846                 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
847                             PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
848             }
849             return false;
850         }
851 
852         // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
853         long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
854         String[] rawCrashHistory = readTimeoutHistory();
855         ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
856                         rawCrashHistory,
857                         Math.max(0, rawCrashHistory.length - fatalCount - 1),
858                         rawCrashHistory.length)));
859         // Something wrong here.
860         crashHistory.add(String.valueOf(nowMs));
861         writeTimeoutHistory(crashHistory);
862 
863         // Returns false if the device has an active USB connection.
864         if (hasActiveUsbConnection()) {
865             return false;
866         }
867 
868         long firstCrashMs;
869         try {
870             firstCrashMs = Long.parseLong(crashHistory.get(0));
871         } catch (NumberFormatException t) {
872             Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
873             resetTimeoutHistory();
874             return false;
875         }
876         return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
877     }
878 
breakCrashLoop()879     private void breakCrashLoop() {
880         try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
881             kmsg.append("Fatal reset to escape the system_server crashing loop\n");
882         } catch (IOException e) {
883             Slog.w(TAG, "Failed to append to kmsg", e);
884         }
885         doSysRq('c');
886     }
887 }
888