1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import android.app.IActivityController; 20 import android.os.Binder; 21 import android.os.RemoteException; 22 import com.android.server.am.ActivityManagerService; 23 import com.android.server.power.PowerManagerService; 24 25 import android.app.AlarmManager; 26 import android.app.PendingIntent; 27 import android.content.BroadcastReceiver; 28 import android.content.ContentResolver; 29 import android.content.Context; 30 import android.content.Intent; 31 import android.content.IntentFilter; 32 import android.os.BatteryManager; 33 import android.os.Debug; 34 import android.os.Handler; 35 import android.os.Looper; 36 import android.os.Process; 37 import android.os.ServiceManager; 38 import android.os.SystemClock; 39 import android.os.SystemProperties; 40 import android.util.EventLog; 41 import android.util.Log; 42 import android.util.Slog; 43 44 import java.io.File; 45 import java.io.FileWriter; 46 import java.io.IOException; 47 import java.util.ArrayList; 48 import java.util.Calendar; 49 50 /** This class calls its monitor every minute. Killing this process if they don't return **/ 51 public class Watchdog extends Thread { 52 static final String TAG = "Watchdog"; 53 static final boolean localLOGV = false || false; 54 55 // Set this to true to use debug default values. 56 static final boolean DB = false; 57 58 // Set this to true to have the watchdog record kernel thread stacks when it fires 59 static final boolean RECORD_KERNEL_THREADS = true; 60 61 static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; 62 static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; 63 64 // These are temporally ordered: larger values as lateness increases 65 static final int COMPLETED = 0; 66 static final int WAITING = 1; 67 static final int WAITED_HALF = 2; 68 static final int OVERDUE = 3; 69 70 // Which native processes to dump into dropbox's stack traces 71 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 72 "/system/bin/mediaserver", 73 "/system/bin/sdcard", 74 "/system/bin/surfaceflinger" 75 }; 76 77 static Watchdog sWatchdog; 78 79 /* This handler will be used to post message back onto the main thread */ 80 final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>(); 81 final HandlerChecker mMonitorChecker; 82 ContentResolver mResolver; 83 BatteryService mBattery; 84 PowerManagerService mPower; 85 AlarmManagerService mAlarm; 86 ActivityManagerService mActivity; 87 88 int mPhonePid; 89 IActivityController mController; 90 boolean mAllowRestart = true; 91 92 /** 93 * Used for checking status of handle threads and scheduling monitor callbacks. 94 */ 95 public final class HandlerChecker implements Runnable { 96 private final Handler mHandler; 97 private final String mName; 98 private final long mWaitMax; 99 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 100 private boolean mCompleted; 101 private Monitor mCurrentMonitor; 102 private long mStartTime; 103 HandlerChecker(Handler handler, String name, long waitMaxMillis)104 HandlerChecker(Handler handler, String name, long waitMaxMillis) { 105 mHandler = handler; 106 mName = name; 107 mWaitMax = waitMaxMillis; 108 mCompleted = true; 109 } 110 addMonitor(Monitor monitor)111 public void addMonitor(Monitor monitor) { 112 mMonitors.add(monitor); 113 } 114 scheduleCheckLocked()115 public void scheduleCheckLocked() { 116 if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) { 117 // If the target looper is or just recently was idling, then 118 // there is no reason to enqueue our checker on it since that 119 // is as good as it not being deadlocked. This avoid having 120 // to do a context switch to check the thread. Note that we 121 // only do this if mCheckReboot is false and we have no 122 // monitors, since those would need to be executed at this point. 123 mCompleted = true; 124 return; 125 } 126 127 if (!mCompleted) { 128 // we already have a check in flight, so no need 129 return; 130 } 131 132 mCompleted = false; 133 mCurrentMonitor = null; 134 mStartTime = SystemClock.uptimeMillis(); 135 mHandler.postAtFrontOfQueue(this); 136 } 137 isOverdueLocked()138 public boolean isOverdueLocked() { 139 return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); 140 } 141 getCompletionStateLocked()142 public int getCompletionStateLocked() { 143 if (mCompleted) { 144 return COMPLETED; 145 } else { 146 long latency = SystemClock.uptimeMillis() - mStartTime; 147 if (latency < mWaitMax/2) { 148 return WAITING; 149 } else if (latency < mWaitMax) { 150 return WAITED_HALF; 151 } 152 } 153 return OVERDUE; 154 } 155 getThread()156 public Thread getThread() { 157 return mHandler.getLooper().getThread(); 158 } 159 getName()160 public String getName() { 161 return mName; 162 } 163 describeBlockedStateLocked()164 public String describeBlockedStateLocked() { 165 if (mCurrentMonitor == null) { 166 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 167 } else { 168 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 169 + " on " + mName + " (" + getThread().getName() + ")"; 170 } 171 } 172 173 @Override run()174 public void run() { 175 final int size = mMonitors.size(); 176 for (int i = 0 ; i < size ; i++) { 177 synchronized (Watchdog.this) { 178 mCurrentMonitor = mMonitors.get(i); 179 } 180 mCurrentMonitor.monitor(); 181 } 182 183 synchronized (Watchdog.this) { 184 mCompleted = true; 185 mCurrentMonitor = null; 186 } 187 } 188 } 189 190 final class RebootRequestReceiver extends BroadcastReceiver { 191 @Override onReceive(Context c, Intent intent)192 public void onReceive(Context c, Intent intent) { 193 if (intent.getIntExtra("nowait", 0) != 0) { 194 rebootSystem("Received ACTION_REBOOT broadcast"); 195 return; 196 } 197 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 198 } 199 } 200 201 public interface Monitor { monitor()202 void monitor(); 203 } 204 getInstance()205 public static Watchdog getInstance() { 206 if (sWatchdog == null) { 207 sWatchdog = new Watchdog(); 208 } 209 210 return sWatchdog; 211 } 212 Watchdog()213 private Watchdog() { 214 super("watchdog"); 215 // Initialize handler checkers for each common thread we want to check. Note 216 // that we are not currently checking the background thread, since it can 217 // potentially hold longer running operations with no guarantees about the timeliness 218 // of operations there. 219 220 // The shared foreground thread is the main checker. It is where we 221 // will also dispatch monitor checks and do other work. 222 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 223 "foreground thread", DEFAULT_TIMEOUT); 224 mHandlerCheckers.add(mMonitorChecker); 225 // Add checker for main thread. We only do a quick check since there 226 // can be UI running on the thread. 227 mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), 228 "main thread", DEFAULT_TIMEOUT)); 229 // Add checker for shared UI thread. 230 mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), 231 "ui thread", DEFAULT_TIMEOUT)); 232 // And also check IO thread. 233 mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), 234 "i/o thread", DEFAULT_TIMEOUT)); 235 } 236 init(Context context, BatteryService battery, PowerManagerService power, AlarmManagerService alarm, ActivityManagerService activity)237 public void init(Context context, BatteryService battery, 238 PowerManagerService power, AlarmManagerService alarm, 239 ActivityManagerService activity) { 240 mResolver = context.getContentResolver(); 241 mBattery = battery; 242 mPower = power; 243 mAlarm = alarm; 244 mActivity = activity; 245 246 context.registerReceiver(new RebootRequestReceiver(), 247 new IntentFilter(Intent.ACTION_REBOOT), 248 android.Manifest.permission.REBOOT, null); 249 } 250 processStarted(String name, int pid)251 public void processStarted(String name, int pid) { 252 synchronized (this) { 253 if ("com.android.phone".equals(name)) { 254 mPhonePid = pid; 255 } 256 } 257 } 258 setActivityController(IActivityController controller)259 public void setActivityController(IActivityController controller) { 260 synchronized (this) { 261 mController = controller; 262 } 263 } 264 setAllowRestart(boolean allowRestart)265 public void setAllowRestart(boolean allowRestart) { 266 synchronized (this) { 267 mAllowRestart = allowRestart; 268 } 269 } 270 addMonitor(Monitor monitor)271 public void addMonitor(Monitor monitor) { 272 synchronized (this) { 273 if (isAlive()) { 274 throw new RuntimeException("Monitors can't be added once the Watchdog is running"); 275 } 276 mMonitorChecker.addMonitor(monitor); 277 } 278 } 279 addThread(Handler thread, String name)280 public void addThread(Handler thread, String name) { 281 addThread(thread, name, DEFAULT_TIMEOUT); 282 } 283 addThread(Handler thread, String name, long timeoutMillis)284 public void addThread(Handler thread, String name, long timeoutMillis) { 285 synchronized (this) { 286 if (isAlive()) { 287 throw new RuntimeException("Threads can't be added once the Watchdog is running"); 288 } 289 mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); 290 } 291 } 292 293 /** 294 * Perform a full reboot of the system. 295 */ rebootSystem(String reason)296 void rebootSystem(String reason) { 297 Slog.i(TAG, "Rebooting system because: " + reason); 298 PowerManagerService pms = (PowerManagerService) ServiceManager.getService("power"); 299 pms.reboot(false, reason, false); 300 } 301 evaluateCheckerCompletionLocked()302 private int evaluateCheckerCompletionLocked() { 303 int state = COMPLETED; 304 for (int i=0; i<mHandlerCheckers.size(); i++) { 305 HandlerChecker hc = mHandlerCheckers.get(i); 306 state = Math.max(state, hc.getCompletionStateLocked()); 307 } 308 return state; 309 } 310 getBlockedCheckersLocked()311 private ArrayList<HandlerChecker> getBlockedCheckersLocked() { 312 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 313 for (int i=0; i<mHandlerCheckers.size(); i++) { 314 HandlerChecker hc = mHandlerCheckers.get(i); 315 if (hc.isOverdueLocked()) { 316 checkers.add(hc); 317 } 318 } 319 return checkers; 320 } 321 describeCheckersLocked(ArrayList<HandlerChecker> checkers)322 private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) { 323 StringBuilder builder = new StringBuilder(128); 324 for (int i=0; i<checkers.size(); i++) { 325 if (builder.length() > 0) { 326 builder.append(", "); 327 } 328 builder.append(checkers.get(i).describeBlockedStateLocked()); 329 } 330 return builder.toString(); 331 } 332 333 @Override run()334 public void run() { 335 boolean waitedHalf = false; 336 while (true) { 337 final ArrayList<HandlerChecker> blockedCheckers; 338 final String subject; 339 final boolean allowRestart; 340 synchronized (this) { 341 long timeout = CHECK_INTERVAL; 342 // Make sure we (re)spin the checkers that have become idle within 343 // this wait-and-check interval 344 for (int i=0; i<mHandlerCheckers.size(); i++) { 345 HandlerChecker hc = mHandlerCheckers.get(i); 346 hc.scheduleCheckLocked(); 347 } 348 349 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 350 // wait while asleep. If the device is asleep then the thing that we are waiting 351 // to timeout on is asleep as well and won't have a chance to run, causing a false 352 // positive on when to kill things. 353 long start = SystemClock.uptimeMillis(); 354 while (timeout > 0) { 355 try { 356 wait(timeout); 357 } catch (InterruptedException e) { 358 Log.wtf(TAG, e); 359 } 360 timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); 361 } 362 363 final int waitState = evaluateCheckerCompletionLocked(); 364 if (waitState == COMPLETED) { 365 // The monitors have returned; reset 366 waitedHalf = false; 367 continue; 368 } else if (waitState == WAITING) { 369 // still waiting but within their configured intervals; back off and recheck 370 continue; 371 } else if (waitState == WAITED_HALF) { 372 if (!waitedHalf) { 373 // We've waited half the deadlock-detection interval. Pull a stack 374 // trace and wait another half. 375 ArrayList<Integer> pids = new ArrayList<Integer>(); 376 pids.add(Process.myPid()); 377 ActivityManagerService.dumpStackTraces(true, pids, null, null, 378 NATIVE_STACKS_OF_INTEREST); 379 waitedHalf = true; 380 } 381 continue; 382 } 383 384 // something is overdue! 385 blockedCheckers = getBlockedCheckersLocked(); 386 subject = describeCheckersLocked(blockedCheckers); 387 allowRestart = mAllowRestart; 388 } 389 390 // If we got here, that means that the system is most likely hung. 391 // First collect stack traces from all threads of the system process. 392 // Then kill this process so that the system will restart. 393 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 394 395 ArrayList<Integer> pids = new ArrayList<Integer>(); 396 pids.add(Process.myPid()); 397 if (mPhonePid > 0) pids.add(mPhonePid); 398 // Pass !waitedHalf so that just in case we somehow wind up here without having 399 // dumped the halfway stacks, we properly re-initialize the trace file. 400 final File stack = ActivityManagerService.dumpStackTraces( 401 !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); 402 403 // Give some extra time to make sure the stack traces get written. 404 // The system's been hanging for a minute, another second or two won't hurt much. 405 SystemClock.sleep(2000); 406 407 // Pull our own kernel thread stacks as well if we're configured for that 408 if (RECORD_KERNEL_THREADS) { 409 dumpKernelStackTraces(); 410 } 411 412 // Trigger the kernel to dump all blocked threads to the kernel log 413 try { 414 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 415 sysrq_trigger.write("w"); 416 sysrq_trigger.close(); 417 } catch (IOException e) { 418 Slog.e(TAG, "Failed to write to /proc/sysrq-trigger"); 419 Slog.e(TAG, e.getMessage()); 420 } 421 422 // Try to add the error to the dropbox, but assuming that the ActivityManager 423 // itself may be deadlocked. (which has happened, causing this statement to 424 // deadlock and the watchdog as a whole to be ineffective) 425 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 426 public void run() { 427 mActivity.addErrorToDropBox( 428 "watchdog", null, "system_server", null, null, 429 subject, null, stack, null); 430 } 431 }; 432 dropboxThread.start(); 433 try { 434 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 435 } catch (InterruptedException ignored) {} 436 437 IActivityController controller; 438 synchronized (this) { 439 controller = mController; 440 } 441 if (controller != null) { 442 Slog.i(TAG, "Reporting stuck state to activity controller"); 443 try { 444 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 445 // 1 = keep waiting, -1 = kill system 446 int res = controller.systemNotResponding(subject); 447 if (res >= 0) { 448 Slog.i(TAG, "Activity controller requested to coninue to wait"); 449 waitedHalf = false; 450 continue; 451 } 452 } catch (RemoteException e) { 453 } 454 } 455 456 // Only kill the process if the debugger is not attached. 457 if (Debug.isDebuggerConnected()) { 458 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 459 } else if (!allowRestart) { 460 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 461 } else { 462 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 463 for (int i=0; i<blockedCheckers.size(); i++) { 464 Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:"); 465 StackTraceElement[] stackTrace 466 = blockedCheckers.get(i).getThread().getStackTrace(); 467 for (StackTraceElement element: stackTrace) { 468 Slog.w(TAG, " at " + element); 469 } 470 } 471 Slog.w(TAG, "*** GOODBYE!"); 472 Process.killProcess(Process.myPid()); 473 System.exit(10); 474 } 475 476 waitedHalf = false; 477 } 478 } 479 dumpKernelStackTraces()480 private File dumpKernelStackTraces() { 481 String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); 482 if (tracesPath == null || tracesPath.length() == 0) { 483 return null; 484 } 485 486 native_dumpKernelStacks(tracesPath); 487 return new File(tracesPath); 488 } 489 native_dumpKernelStacks(String tracesPath)490 private native void native_dumpKernelStacks(String tracesPath); 491 } 492