• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import android.app.IActivityController;
20 import android.os.Binder;
21 import android.os.RemoteException;
22 import com.android.server.am.ActivityManagerService;
23 import com.android.server.power.PowerManagerService;
24 
25 import android.app.AlarmManager;
26 import android.app.PendingIntent;
27 import android.content.BroadcastReceiver;
28 import android.content.ContentResolver;
29 import android.content.Context;
30 import android.content.Intent;
31 import android.content.IntentFilter;
32 import android.os.BatteryManager;
33 import android.os.Debug;
34 import android.os.Handler;
35 import android.os.Looper;
36 import android.os.Process;
37 import android.os.ServiceManager;
38 import android.os.SystemClock;
39 import android.os.SystemProperties;
40 import android.util.EventLog;
41 import android.util.Log;
42 import android.util.Slog;
43 
44 import java.io.File;
45 import java.io.FileWriter;
46 import java.io.IOException;
47 import java.util.ArrayList;
48 import java.util.Calendar;
49 
/** Periodically runs its handler checkers and monitors, killing this process if they don't return in time. */
51 public class Watchdog extends Thread {
    // Tag attached to every log line written by this class.
    static final String TAG = "Watchdog";
    // Verbose-logging switch; evaluates to false.
    // NOTE(review): not referenced anywhere in the visible code.
    static final boolean localLOGV = false || false;

    // Set this to true to use debug default values.
    static final boolean DB = false;

    // Set this to true to have the watchdog record kernel thread stacks when it fires
    static final boolean RECORD_KERNEL_THREADS = true;

    // Default maximum time, in milliseconds, a checker may stay outstanding before
    // it counts as overdue: 10 seconds when DB is set, 60 seconds otherwise.
    static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
    // How long, in milliseconds, run() waits between evaluations of the checkers.
    static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;

    // These are temporally ordered: larger values as lateness increases
    // COMPLETED:   the checker has no check outstanding.
    static final int COMPLETED = 0;
    // WAITING:     a check is outstanding for less than half of the checker's maximum wait.
    static final int WAITING = 1;
    // WAITED_HALF: a check is outstanding for at least half, but less than all, of the maximum wait.
    static final int WAITED_HALF = 2;
    // OVERDUE:     a check is outstanding for the full maximum wait or longer.
    static final int OVERDUE = 3;

    // Which native processes to dump into dropbox's stack traces
    // (passed to ActivityManagerService.dumpStackTraces() by run()).
    public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
        "/system/bin/mediaserver",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger"
    };

    // Singleton instance, created on first call to getInstance().
    static Watchdog sWatchdog;

    // Every handler thread being watched; filled in by the constructor and addThread().
    final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();
    // The checker that also carries the Monitor callbacks registered via addMonitor().
    final HandlerChecker mMonitorChecker;
    // The following five references are stored by init().  Of these, only
    // mActivity is used in the visible code (to file the dropbox entry in run()).
    ContentResolver mResolver;
    BatteryService mBattery;
    PowerManagerService mPower;
    AlarmManagerService mAlarm;
    ActivityManagerService mActivity;

    // Pid recorded by processStarted() for "com.android.phone"; when positive its
    // stacks are dumped together with this process's when the watchdog fires.
    int mPhonePid;
    // Optional controller that run() consults before killing the process.
    IActivityController mController;
    // When false, run() logs the hang but does not kill the process.
    boolean mAllowRestart = true;
91 
92     /**
93      * Used for checking status of handle threads and scheduling monitor callbacks.
94      */
95     public final class HandlerChecker implements Runnable {
96         private final Handler mHandler;
97         private final String mName;
98         private final long mWaitMax;
99         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
100         private boolean mCompleted;
101         private Monitor mCurrentMonitor;
102         private long mStartTime;
103 
HandlerChecker(Handler handler, String name, long waitMaxMillis)104         HandlerChecker(Handler handler, String name, long waitMaxMillis) {
105             mHandler = handler;
106             mName = name;
107             mWaitMax = waitMaxMillis;
108             mCompleted = true;
109         }
110 
addMonitor(Monitor monitor)111         public void addMonitor(Monitor monitor) {
112             mMonitors.add(monitor);
113         }
114 
scheduleCheckLocked()115         public void scheduleCheckLocked() {
116             if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
117                 // If the target looper is or just recently was idling, then
118                 // there is no reason to enqueue our checker on it since that
119                 // is as good as it not being deadlocked.  This avoid having
120                 // to do a context switch to check the thread.  Note that we
121                 // only do this if mCheckReboot is false and we have no
122                 // monitors, since those would need to be executed at this point.
123                 mCompleted = true;
124                 return;
125             }
126 
127             if (!mCompleted) {
128                 // we already have a check in flight, so no need
129                 return;
130             }
131 
132             mCompleted = false;
133             mCurrentMonitor = null;
134             mStartTime = SystemClock.uptimeMillis();
135             mHandler.postAtFrontOfQueue(this);
136         }
137 
isOverdueLocked()138         public boolean isOverdueLocked() {
139             return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax);
140         }
141 
getCompletionStateLocked()142         public int getCompletionStateLocked() {
143             if (mCompleted) {
144                 return COMPLETED;
145             } else {
146                 long latency = SystemClock.uptimeMillis() - mStartTime;
147                 if (latency < mWaitMax/2) {
148                     return WAITING;
149                 } else if (latency < mWaitMax) {
150                     return WAITED_HALF;
151                 }
152             }
153             return OVERDUE;
154         }
155 
getThread()156         public Thread getThread() {
157             return mHandler.getLooper().getThread();
158         }
159 
getName()160         public String getName() {
161             return mName;
162         }
163 
describeBlockedStateLocked()164         public String describeBlockedStateLocked() {
165             if (mCurrentMonitor == null) {
166                 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
167             } else {
168                 return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
169                         + " on " + mName + " (" + getThread().getName() + ")";
170             }
171         }
172 
173         @Override
run()174         public void run() {
175             final int size = mMonitors.size();
176             for (int i = 0 ; i < size ; i++) {
177                 synchronized (Watchdog.this) {
178                     mCurrentMonitor = mMonitors.get(i);
179                 }
180                 mCurrentMonitor.monitor();
181             }
182 
183             synchronized (Watchdog.this) {
184                 mCompleted = true;
185                 mCurrentMonitor = null;
186             }
187         }
188     }
189 
190     final class RebootRequestReceiver extends BroadcastReceiver {
191         @Override
onReceive(Context c, Intent intent)192         public void onReceive(Context c, Intent intent) {
193             if (intent.getIntExtra("nowait", 0) != 0) {
194                 rebootSystem("Received ACTION_REBOOT broadcast");
195                 return;
196             }
197             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
198         }
199     }
200 
    /**
     * Callback that can be registered with the watchdog through
     * {@code addMonitor}.
     */
    public interface Monitor {
        /**
         * Invoked by {@code HandlerChecker.run()} on the checker's handler
         * thread.  The checker is not marked complete until this call
         * returns, so an implementation that blocks will eventually be
         * reported as a hang.
         */
        void monitor();
    }
204 
getInstance()205     public static Watchdog getInstance() {
206         if (sWatchdog == null) {
207             sWatchdog = new Watchdog();
208         }
209 
210         return sWatchdog;
211     }
212 
Watchdog()213     private Watchdog() {
214         super("watchdog");
215         // Initialize handler checkers for each common thread we want to check.  Note
216         // that we are not currently checking the background thread, since it can
217         // potentially hold longer running operations with no guarantees about the timeliness
218         // of operations there.
219 
220         // The shared foreground thread is the main checker.  It is where we
221         // will also dispatch monitor checks and do other work.
222         mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
223                 "foreground thread", DEFAULT_TIMEOUT);
224         mHandlerCheckers.add(mMonitorChecker);
225         // Add checker for main thread.  We only do a quick check since there
226         // can be UI running on the thread.
227         mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
228                 "main thread", DEFAULT_TIMEOUT));
229         // Add checker for shared UI thread.
230         mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
231                 "ui thread", DEFAULT_TIMEOUT));
232         // And also check IO thread.
233         mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
234                 "i/o thread", DEFAULT_TIMEOUT));
235     }
236 
init(Context context, BatteryService battery, PowerManagerService power, AlarmManagerService alarm, ActivityManagerService activity)237     public void init(Context context, BatteryService battery,
238             PowerManagerService power, AlarmManagerService alarm,
239             ActivityManagerService activity) {
240         mResolver = context.getContentResolver();
241         mBattery = battery;
242         mPower = power;
243         mAlarm = alarm;
244         mActivity = activity;
245 
246         context.registerReceiver(new RebootRequestReceiver(),
247                 new IntentFilter(Intent.ACTION_REBOOT),
248                 android.Manifest.permission.REBOOT, null);
249     }
250 
processStarted(String name, int pid)251     public void processStarted(String name, int pid) {
252         synchronized (this) {
253             if ("com.android.phone".equals(name)) {
254                 mPhonePid = pid;
255             }
256         }
257     }
258 
setActivityController(IActivityController controller)259     public void setActivityController(IActivityController controller) {
260         synchronized (this) {
261             mController = controller;
262         }
263     }
264 
setAllowRestart(boolean allowRestart)265     public void setAllowRestart(boolean allowRestart) {
266         synchronized (this) {
267             mAllowRestart = allowRestart;
268         }
269     }
270 
addMonitor(Monitor monitor)271     public void addMonitor(Monitor monitor) {
272         synchronized (this) {
273             if (isAlive()) {
274                 throw new RuntimeException("Monitors can't be added once the Watchdog is running");
275             }
276             mMonitorChecker.addMonitor(monitor);
277         }
278     }
279 
addThread(Handler thread, String name)280     public void addThread(Handler thread, String name) {
281         addThread(thread, name, DEFAULT_TIMEOUT);
282     }
283 
addThread(Handler thread, String name, long timeoutMillis)284     public void addThread(Handler thread, String name, long timeoutMillis) {
285         synchronized (this) {
286             if (isAlive()) {
287                 throw new RuntimeException("Threads can't be added once the Watchdog is running");
288             }
289             mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
290         }
291     }
292 
293     /**
294      * Perform a full reboot of the system.
295      */
rebootSystem(String reason)296     void rebootSystem(String reason) {
297         Slog.i(TAG, "Rebooting system because: " + reason);
298         PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power");
299         pms.reboot(false, reason, false);
300     }
301 
evaluateCheckerCompletionLocked()302     private int evaluateCheckerCompletionLocked() {
303         int state = COMPLETED;
304         for (int i=0; i<mHandlerCheckers.size(); i++) {
305             HandlerChecker hc = mHandlerCheckers.get(i);
306             state = Math.max(state, hc.getCompletionStateLocked());
307         }
308         return state;
309     }
310 
getBlockedCheckersLocked()311     private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
312         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
313         for (int i=0; i<mHandlerCheckers.size(); i++) {
314             HandlerChecker hc = mHandlerCheckers.get(i);
315             if (hc.isOverdueLocked()) {
316                 checkers.add(hc);
317             }
318         }
319         return checkers;
320     }
321 
describeCheckersLocked(ArrayList<HandlerChecker> checkers)322     private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
323         StringBuilder builder = new StringBuilder(128);
324         for (int i=0; i<checkers.size(); i++) {
325             if (builder.length() > 0) {
326                 builder.append(", ");
327             }
328             builder.append(checkers.get(i).describeBlockedStateLocked());
329         }
330         return builder.toString();
331     }
332 
333     @Override
run()334     public void run() {
335         boolean waitedHalf = false;
336         while (true) {
337             final ArrayList<HandlerChecker> blockedCheckers;
338             final String subject;
339             final boolean allowRestart;
340             synchronized (this) {
341                 long timeout = CHECK_INTERVAL;
342                 // Make sure we (re)spin the checkers that have become idle within
343                 // this wait-and-check interval
344                 for (int i=0; i<mHandlerCheckers.size(); i++) {
345                     HandlerChecker hc = mHandlerCheckers.get(i);
346                     hc.scheduleCheckLocked();
347                 }
348 
349                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
350                 // wait while asleep. If the device is asleep then the thing that we are waiting
351                 // to timeout on is asleep as well and won't have a chance to run, causing a false
352                 // positive on when to kill things.
353                 long start = SystemClock.uptimeMillis();
354                 while (timeout > 0) {
355                     try {
356                         wait(timeout);
357                     } catch (InterruptedException e) {
358                         Log.wtf(TAG, e);
359                     }
360                     timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
361                 }
362 
363                 final int waitState = evaluateCheckerCompletionLocked();
364                 if (waitState == COMPLETED) {
365                     // The monitors have returned; reset
366                     waitedHalf = false;
367                     continue;
368                 } else if (waitState == WAITING) {
369                     // still waiting but within their configured intervals; back off and recheck
370                     continue;
371                 } else if (waitState == WAITED_HALF) {
372                     if (!waitedHalf) {
373                         // We've waited half the deadlock-detection interval.  Pull a stack
374                         // trace and wait another half.
375                         ArrayList<Integer> pids = new ArrayList<Integer>();
376                         pids.add(Process.myPid());
377                         ActivityManagerService.dumpStackTraces(true, pids, null, null,
378                                 NATIVE_STACKS_OF_INTEREST);
379                         waitedHalf = true;
380                     }
381                     continue;
382                 }
383 
384                 // something is overdue!
385                 blockedCheckers = getBlockedCheckersLocked();
386                 subject = describeCheckersLocked(blockedCheckers);
387                 allowRestart = mAllowRestart;
388             }
389 
390             // If we got here, that means that the system is most likely hung.
391             // First collect stack traces from all threads of the system process.
392             // Then kill this process so that the system will restart.
393             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
394 
395             ArrayList<Integer> pids = new ArrayList<Integer>();
396             pids.add(Process.myPid());
397             if (mPhonePid > 0) pids.add(mPhonePid);
398             // Pass !waitedHalf so that just in case we somehow wind up here without having
399             // dumped the halfway stacks, we properly re-initialize the trace file.
400             final File stack = ActivityManagerService.dumpStackTraces(
401                     !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
402 
403             // Give some extra time to make sure the stack traces get written.
404             // The system's been hanging for a minute, another second or two won't hurt much.
405             SystemClock.sleep(2000);
406 
407             // Pull our own kernel thread stacks as well if we're configured for that
408             if (RECORD_KERNEL_THREADS) {
409                 dumpKernelStackTraces();
410             }
411 
412             // Trigger the kernel to dump all blocked threads to the kernel log
413             try {
414                 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
415                 sysrq_trigger.write("w");
416                 sysrq_trigger.close();
417             } catch (IOException e) {
418                 Slog.e(TAG, "Failed to write to /proc/sysrq-trigger");
419                 Slog.e(TAG, e.getMessage());
420             }
421 
422             // Try to add the error to the dropbox, but assuming that the ActivityManager
423             // itself may be deadlocked.  (which has happened, causing this statement to
424             // deadlock and the watchdog as a whole to be ineffective)
425             Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
426                     public void run() {
427                         mActivity.addErrorToDropBox(
428                                 "watchdog", null, "system_server", null, null,
429                                 subject, null, stack, null);
430                     }
431                 };
432             dropboxThread.start();
433             try {
434                 dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
435             } catch (InterruptedException ignored) {}
436 
437             IActivityController controller;
438             synchronized (this) {
439                 controller = mController;
440             }
441             if (controller != null) {
442                 Slog.i(TAG, "Reporting stuck state to activity controller");
443                 try {
444                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
445                     // 1 = keep waiting, -1 = kill system
446                     int res = controller.systemNotResponding(subject);
447                     if (res >= 0) {
448                         Slog.i(TAG, "Activity controller requested to coninue to wait");
449                         waitedHalf = false;
450                         continue;
451                     }
452                 } catch (RemoteException e) {
453                 }
454             }
455 
456             // Only kill the process if the debugger is not attached.
457             if (Debug.isDebuggerConnected()) {
458                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
459             } else if (!allowRestart) {
460                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
461             } else {
462                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
463                 for (int i=0; i<blockedCheckers.size(); i++) {
464                     Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
465                     StackTraceElement[] stackTrace
466                             = blockedCheckers.get(i).getThread().getStackTrace();
467                     for (StackTraceElement element: stackTrace) {
468                         Slog.w(TAG, "    at " + element);
469                     }
470                 }
471                 Slog.w(TAG, "*** GOODBYE!");
472                 Process.killProcess(Process.myPid());
473                 System.exit(10);
474             }
475 
476             waitedHalf = false;
477         }
478     }
479 
dumpKernelStackTraces()480     private File dumpKernelStackTraces() {
481         String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
482         if (tracesPath == null || tracesPath.length() == 0) {
483             return null;
484         }
485 
486         native_dumpKernelStacks(tracesPath);
487         return new File(tracesPath);
488     }
489 
    /**
     * Native hook called by {@code dumpKernelStackTraces()} with the traces
     * file path.
     * NOTE(review): the implementation is not visible here; presumably it
     * writes this process's kernel thread stacks to that file; confirm
     * against the JNI source.
     *
     * @param tracesPath path taken from the {@code dalvik.vm.stack-trace-file} property
     */
    private native void native_dumpKernelStacks(String tracesPath);
491 }
492