1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import android.app.IActivityController; 20 import android.os.Binder; 21 import android.os.RemoteException; 22 import com.android.server.am.ActivityManagerService; 23 24 import android.content.BroadcastReceiver; 25 import android.content.ContentResolver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.os.Debug; 30 import android.os.Handler; 31 import android.os.IPowerManager; 32 import android.os.Looper; 33 import android.os.Process; 34 import android.os.ServiceManager; 35 import android.os.SystemClock; 36 import android.os.SystemProperties; 37 import android.util.EventLog; 38 import android.util.Log; 39 import android.util.Slog; 40 41 import java.io.File; 42 import java.io.FileWriter; 43 import java.io.IOException; 44 import java.util.ArrayList; 45 46 /** This class calls its monitor every minute. Killing this process if they don't return **/ 47 public class Watchdog extends Thread { 48 static final String TAG = "Watchdog"; 49 50 // Set this to true to use debug default values. 51 static final boolean DB = false; 52 53 // Set this to true to have the watchdog record kernel thread stacks when it fires 54 static final boolean RECORD_KERNEL_THREADS = true; 55 56 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 57 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 58 59 // These are temporally ordered: larger values as lateness increases 60 static final int COMPLETED = 0; 61 static final int WAITING = 1; 62 static final int WAITED_HALF = 2; 63 static final int OVERDUE = 3; 64 65 // Which native processes to dump into dropbox's stack traces 66 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 67 "/system/bin/audioserver", 68 "/system/bin/cameraserver", 69 "/system/bin/drmserver", 70 "/system/bin/mediadrmserver", 71 "/system/bin/mediaserver", 72 "/system/bin/sdcard", 73 "/system/bin/surfaceflinger", 74 "media.codec", // system/bin/mediacodec 75 "media.extractor", // system/bin/mediaextractor 76 "com.android.bluetooth", // Bluetooth service 77 }; 78 79 static Watchdog sWatchdog; 80 81 /* This handler will be used to post message back onto the main thread */ 82 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>(); 83 final HandlerChecker mMonitorChecker; 84 ContentResolver mResolver; 85 ActivityManagerService mActivity; 86 87 int mPhonePid; 88 IActivityController mController; 89 boolean mAllowRestart = true; 90 91 /** 92 * Used for checking status of handle threads and scheduling monitor callbacks. 93 */ 94 public final class HandlerChecker implements Runnable { 95 private final Handler mHandler; 96 private final String mName; 97 private final long mWaitMax; 98 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 99 private boolean mCompleted; 100 private Monitor mCurrentMonitor; 101 private long mStartTime; 102 HandlerChecker(Handler handler, String name, long waitMaxMillis)103 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 104 mHandler = handler; 105 mName = name; 106 mWaitMax = waitMaxMillis; 107 mCompleted = true; 108 } 109 addMonitor(Monitor monitor)110 public void addMonitor(Monitor monitor) { 111 mMonitors.add(monitor); 112 } 113 scheduleCheckLocked()114 public void scheduleCheckLocked() { 115 if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) { 116 // If the target looper has recently been polling, then 117 // there is no reason to enqueue our checker on it since that 118 // is as good as it not being deadlocked. This avoid having 119 // to do a context switch to check the thread. Note that we 120 // only do this if mCheckReboot is false and we have no 121 // monitors, since those would need to be executed at this point. 122 mCompleted = true; 123 return; 124 } 125 126 if (!mCompleted) { 127 // we already have a check in flight, so no need 128 return; 129 } 130 131 mCompleted = false; 132 mCurrentMonitor = null; 133 mStartTime = SystemClock.uptimeMillis(); 134 mHandler.postAtFrontOfQueue(this); 135 } 136 isOverdueLocked()137 public boolean isOverdueLocked() { 138 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 139 } 140 getCompletionStateLocked()141 public int getCompletionStateLocked() { 142 if (mCompleted) { 143 return COMPLETED; 144 } else { 145 long latency = SystemClock.uptimeMillis() - mStartTime; 146 if (latency < mWaitMax/2) { 147 return WAITING; 148 } else if (latency < mWaitMax) { 149 return WAITED_HALF; 150 } 151 } 152 return OVERDUE; 153 } 154 getThread()155 public Thread getThread() { 156 return mHandler.getLooper().getThread(); 157 } 158 getName()159 public String getName() { 160 return mName; 161 } 162 describeBlockedStateLocked()163 public String describeBlockedStateLocked() { 164 if (mCurrentMonitor == null) { 165 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 166 } else { 167 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 168 + " on " + mName + " (" + getThread().getName() + ")"; 169 } 170 } 171 172 @Override run()173 public void run() { 174 final int size = mMonitors.size(); 175 for (int i = 0 ; i < size ; i++) { 176 synchronized (Watchdog.this) { 177 mCurrentMonitor = mMonitors.get(i); 178 } 179 mCurrentMonitor.monitor(); 180 } 181 182 synchronized (Watchdog.this) { 183 mCompleted = true; 184 mCurrentMonitor = null; 185 } 186 } 187 } 188 189 final class RebootRequestReceiver extends BroadcastReceiver { 190 @Override onReceive(Context c, Intent intent)191 public void onReceive(Context c, Intent intent) { 192 if (intent.getIntExtra("nowait", 0) != 0) { 193 rebootSystem("Received ACTION_REBOOT broadcast"); 194 return; 195 } 196 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 197 } 198 } 199 200 /** Monitor for checking the availability of binder threads. The monitor will block until 201 * there is a binder thread available to process in coming IPCs to make sure other processes 202 * can still communicate with the service. 203 */ 204 private static final class BinderThreadMonitor implements Watchdog.Monitor { 205 @Override monitor()206 public void monitor() { 207 Binder.blockUntilThreadAvailable(); 208 } 209 } 210 211 public interface Monitor { monitor()212 void monitor(); 213 } 214 getInstance()215 public static Watchdog getInstance() { 216 if (sWatchdog == null) { 217 sWatchdog = new Watchdog(); 218 } 219 220 return sWatchdog; 221 } 222 Watchdog()223 private Watchdog() { 224 super("watchdog"); 225 // Initialize handler checkers for each common thread we want to check. Note 226 // that we are not currently checking the background thread, since it can 227 // potentially hold longer running operations with no guarantees about the timeliness 228 // of operations there. 229 230 // The shared foreground thread is the main checker. It is where we 231 // will also dispatch monitor checks and do other work. 232 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 233 "foreground thread", DEFAULT_TIMEOUT); 234 mHandlerCheckers.add(mMonitorChecker); 235 // Add checker for main thread. We only do a quick check since there 236 // can be UI running on the thread. 237 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 238 "main thread", DEFAULT_TIMEOUT)); 239 // Add checker for shared UI thread. 240 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 241 "ui thread", DEFAULT_TIMEOUT)); 242 // And also check IO thread. 243 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 244 "i/o thread", DEFAULT_TIMEOUT)); 245 // And the display thread. 246 mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(), 247 "display thread", DEFAULT_TIMEOUT)); 248 249 // Initialize monitor for Binder threads. 250 addMonitor(new BinderThreadMonitor()); 251 } 252 init(Context context, ActivityManagerService activity)253 public void init(Context context, ActivityManagerService activity) { 254 mResolver = context.getContentResolver(); 255 mActivity = activity; 256 257 context.registerReceiver(new RebootRequestReceiver(), 258 new IntentFilter(Intent.ACTION_REBOOT), 259 android.Manifest.permission.REBOOT, null); 260 } 261 processStarted(String name, int pid)262 public void processStarted(String name, int pid) { 263 synchronized (this) { 264 if ("com.android.phone".equals(name)) { 265 mPhonePid = pid; 266 } 267 } 268 } 269 setActivityController(IActivityController controller)270 public void setActivityController(IActivityController controller) { 271 synchronized (this) { 272 mController = controller; 273 } 274 } 275 setAllowRestart(boolean allowRestart)276 public void setAllowRestart(boolean allowRestart) { 277 synchronized (this) { 278 mAllowRestart = allowRestart; 279 } 280 } 281 addMonitor(Monitor monitor)282 public void addMonitor(Monitor monitor) { 283 synchronized (this) { 284 if (isAlive()) { 285 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 286 } 287 mMonitorChecker.addMonitor(monitor); 288 } 289 } 290 addThread(Handler thread)291 public void addThread(Handler thread) { 292 addThread(thread, DEFAULT_TIMEOUT); 293 } 294 addThread(Handler thread, long timeoutMillis)295 public void addThread(Handler thread, long timeoutMillis) { 296 synchronized (this) { 297 if (isAlive()) { 298 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 299 } 300 final String name = thread.getLooper().getThread().getName(); 301 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 302 } 303 } 304 305 /** 306 * Perform a full reboot of the system. 307 */ rebootSystem(String reason)308 void rebootSystem(String reason) { 309 Slog.i(TAG, "Rebooting system because: " + reason); 310 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 311 try { 312 pms.reboot(false, reason, false); 313 } catch (RemoteException ex) { 314 } 315 } 316 evaluateCheckerCompletionLocked()317 private int evaluateCheckerCompletionLocked() { 318 int state = COMPLETED; 319 for (int i=0; i<mHandlerCheckers.size(); i++) { 320 HandlerChecker hc = mHandlerCheckers.get(i); 321 state = Math.max(state, hc.getCompletionStateLocked()); 322 } 323 return state; 324 } 325 getBlockedCheckersLocked()326 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 327 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 328 for (int i=0; i<mHandlerCheckers.size(); i++) { 329 HandlerChecker hc = mHandlerCheckers.get(i); 330 if (hc.isOverdueLocked()) { 331 checkers.add(hc); 332 } 333 } 334 return checkers; 335 } 336 describeCheckersLocked(ArrayList<HandlerChecker> checkers)337 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 338 StringBuilder builder = new StringBuilder(128); 339 for (int i=0; i<checkers.size(); i++) { 340 if (builder.length() > 0) { 341 builder.append(", "); 342 } 343 builder.append(checkers.get(i).describeBlockedStateLocked()); 344 } 345 return builder.toString(); 346 } 347 348 @Override run()349 public void run() { 350 boolean waitedHalf = false; 351 while (true) { 352 final ArrayList<HandlerChecker> blockedCheckers; 353 final String subject; 354 final boolean allowRestart; 355 int debuggerWasConnected = 0; 356 synchronized (this) { 357 long timeout = CHECK_INTERVAL; 358 // Make sure we (re)spin the checkers that have become idle within 359 // this wait-and-check interval 360 for (int i=0; i<mHandlerCheckers.size(); i++) { 361 HandlerChecker hc = mHandlerCheckers.get(i); 362 hc.scheduleCheckLocked(); 363 } 364 365 if (debuggerWasConnected > 0) { 366 debuggerWasConnected--; 367 } 368 369 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 370 // wait while asleep. If the device is asleep then the thing that we are waiting 371 // to timeout on is asleep as well and won't have a chance to run, causing a false 372 // positive on when to kill things. 373 long start = SystemClock.uptimeMillis(); 374 while (timeout > 0) { 375 if (Debug.isDebuggerConnected()) { 376 debuggerWasConnected = 2; 377 } 378 try { 379 wait(timeout); 380 } catch (InterruptedException e) { 381 Log.wtf(TAG, e); 382 } 383 if (Debug.isDebuggerConnected()) { 384 debuggerWasConnected = 2; 385 } 386 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 387 } 388 389 final int waitState = evaluateCheckerCompletionLocked(); 390 if (waitState == COMPLETED) { 391 // The monitors have returned; reset 392 waitedHalf = false; 393 continue; 394 } else if (waitState == WAITING) { 395 // still waiting but within their configured intervals; back off and recheck 396 continue; 397 } else if (waitState == WAITED_HALF) { 398 if (!waitedHalf) { 399 // We've waited half the deadlock-detection interval. Pull a stack 400 // trace and wait another half. 401 ArrayList<Integer> pids = new ArrayList<Integer>(); 402 pids.add(Process.myPid()); 403 ActivityManagerService.dumpStackTraces(true, pids, null, null, 404 NATIVE_STACKS_OF_INTEREST); 405 waitedHalf = true; 406 } 407 continue; 408 } 409 410 // something is overdue! 411 blockedCheckers = getBlockedCheckersLocked(); 412 subject = describeCheckersLocked(blockedCheckers); 413 allowRestart = mAllowRestart; 414 } 415 416 // If we got here, that means that the system is most likely hung. 417 // First collect stack traces from all threads of the system process. 418 // Then kill this process so that the system will restart. 419 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 420 421 ArrayList<Integer> pids = new ArrayList<Integer>(); 422 pids.add(Process.myPid()); 423 if (mPhonePid > 0) pids.add(mPhonePid); 424 // Pass !waitedHalf so that just in case we somehow wind up here without having 425 // dumped the halfway stacks, we properly re-initialize the trace file. 426 final File stack = ActivityManagerService.dumpStackTraces( 427 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 428 429 // Give some extra time to make sure the stack traces get written. 430 // The system's been hanging for a minute, another second or two won't hurt much. 431 SystemClock.sleep(2000); 432 433 // Pull our own kernel thread stacks as well if we're configured for that 434 if (RECORD_KERNEL_THREADS) { 435 dumpKernelStackTraces(); 436 } 437 438 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log 439 doSysRq('w'); 440 doSysRq('l'); 441 442 // Try to add the error to the dropbox, but assuming that the ActivityManager 443 // itself may be deadlocked. (which has happened, causing this statement to 444 // deadlock and the watchdog as a whole to be ineffective) 445 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 446 public void run() { 447 mActivity.addErrorToDropBox( 448 "watchdog", null, "system_server", null, null, 449 subject, null, stack, null); 450 } 451 }; 452 dropboxThread.start(); 453 try { 454 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 455 } catch (InterruptedException ignored) {} 456 457 IActivityController controller; 458 synchronized (this) { 459 controller = mController; 460 } 461 if (controller != null) { 462 Slog.i(TAG, "Reporting stuck state to activity controller"); 463 try { 464 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 465 // 1 = keep waiting, -1 = kill system 466 int res = controller.systemNotResponding(subject); 467 if (res >= 0) { 468 Slog.i(TAG, "Activity controller requested to coninue to wait"); 469 waitedHalf = false; 470 continue; 471 } 472 } catch (RemoteException e) { 473 } 474 } 475 476 // Only kill the process if the debugger is not attached. 477 if (Debug.isDebuggerConnected()) { 478 debuggerWasConnected = 2; 479 } 480 if (debuggerWasConnected >= 2) { 481 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 482 } else if (debuggerWasConnected > 0) { 483 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 484 } else if (!allowRestart) { 485 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 486 } else { 487 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 488 for (int i=0; i<blockedCheckers.size(); i++) { 489 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 490 StackTraceElement[] stackTrace 491 = blockedCheckers.get(i).getThread().getStackTrace(); 492 for (StackTraceElement element: stackTrace) { 493 Slog.w(TAG, " at " + element); 494 } 495 } 496 Slog.w(TAG, "*** GOODBYE!"); 497 Process.killProcess(Process.myPid()); 498 System.exit(10); 499 } 500 501 waitedHalf = false; 502 } 503 } 504 doSysRq(char c)505 private void doSysRq(char c) { 506 try { 507 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 508 sysrq_trigger.write(c); 509 sysrq_trigger.close(); 510 } catch (IOException e) { 511 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 512 } 513 } 514 dumpKernelStackTraces()515 private File dumpKernelStackTraces() { 516 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 517 if (tracesPath == null || tracesPath.length() == 0) { 518 return null; 519 } 520 521 native_dumpKernelStacks(tracesPath); 522 return new File(tracesPath); 523 } 524 native_dumpKernelStacks(String tracesPath)525 private native void native_dumpKernelStacks(String tracesPath); 526 } 527