1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout; 20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout; 21 22 import android.annotation.NonNull; 23 import android.annotation.Nullable; 24 import android.app.IActivityController; 25 import android.content.BroadcastReceiver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.database.ContentObserver; 30 import android.hidl.manager.V1_0.IServiceManager; 31 import android.net.Uri; 32 import android.os.Binder; 33 import android.os.Build; 34 import android.os.Debug; 35 import android.os.FileUtils; 36 import android.os.Handler; 37 import android.os.IPowerManager; 38 import android.os.Looper; 39 import android.os.Process; 40 import android.os.RemoteException; 41 import android.os.ServiceDebugInfo; 42 import android.os.ServiceManager; 43 import android.os.SystemClock; 44 import android.os.SystemProperties; 45 import android.os.UserHandle; 46 import android.provider.Settings; 47 import android.sysprop.WatchdogProperties; 48 import android.util.Dumpable; 49 import android.util.EventLog; 50 import android.util.Log; 51 import android.util.Slog; 52 import android.util.SparseArray; 53 54 import com.android.internal.os.BackgroundThread; 55 import com.android.internal.os.ProcessCpuTracker; 56 import com.android.internal.os.ZygoteConnectionConstants; 57 import com.android.internal.util.FrameworkStatsLog; 58 import com.android.server.am.ActivityManagerService; 59 import com.android.server.am.TraceErrorLogger; 60 import com.android.server.criticalevents.CriticalEventLog; 61 import com.android.server.wm.SurfaceAnimationThread; 62 63 import java.io.BufferedReader; 64 import java.io.File; 65 import java.io.FileNotFoundException; 66 import java.io.FileReader; 67 import java.io.FileWriter; 68 import java.io.IOException; 69 import java.io.PrintWriter; 70 import java.io.StringWriter; 71 import java.util.ArrayList; 72 import java.util.Arrays; 73 import java.util.Collections; 74 import java.util.HashSet; 75 import java.util.List; 76 import java.util.Optional; 77 import java.util.UUID; 78 import java.util.concurrent.TimeUnit; 79 80 /** 81 * This class calls its monitor every minute. Killing this process if they don't return 82 **/ 83 public class Watchdog implements Dumpable { 84 static final String TAG = "Watchdog"; 85 86 /** Debug flag. */ 87 public static final boolean DEBUG = false; 88 89 // Set this to true to use debug default values. 90 private static final boolean DB = false; 91 92 // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with 93 // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications 94 // can trigger the watchdog. 95 // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped 96 // applications may not work with a debug build. CTS will fail. 97 private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000; 98 99 // These are temporally ordered: larger values as lateness increases 100 private static final int COMPLETED = 0; 101 private static final int WAITING = 1; 102 private static final int WAITED_HALF = 2; 103 private static final int OVERDUE = 3; 104 105 // Track watchdog timeout history and break the crash loop if there is. 106 private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt"; 107 private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count"; 108 private static final String PROP_FATAL_LOOP_WINDOWS_SECS = 109 "framework_watchdog.fatal_window.second"; 110 111 // Which native processes to dump into dropbox's stack traces 112 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 113 "/system/bin/audioserver", 114 "/system/bin/cameraserver", 115 "/system/bin/drmserver", 116 "/system/bin/keystore2", 117 "/system/bin/mediadrmserver", 118 "/system/bin/mediaserver", 119 "/system/bin/netd", 120 "/system/bin/sdcard", 121 "/system/bin/surfaceflinger", 122 "/system/bin/vold", 123 "media.extractor", // system/bin/mediaextractor 124 "media.metrics", // system/bin/mediametrics 125 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 126 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec 127 "media.transcoding", // Media transcoding service 128 "com.android.bluetooth", // Bluetooth service 129 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon 130 }; 131 132 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 133 "android.hardware.audio@4.0::IDevicesFactory", 134 "android.hardware.audio@5.0::IDevicesFactory", 135 "android.hardware.audio@6.0::IDevicesFactory", 136 "android.hardware.audio@7.0::IDevicesFactory", 137 "android.hardware.biometrics.face@1.0::IBiometricsFace", 138 "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint", 139 "android.hardware.bluetooth@1.0::IBluetoothHci", 140 "android.hardware.camera.provider@2.4::ICameraProvider", 141 "android.hardware.gnss@1.0::IGnss", 142 "android.hardware.graphics.allocator@2.0::IAllocator", 143 "android.hardware.graphics.composer@2.1::IComposer", 144 "android.hardware.health@2.0::IHealth", 145 "android.hardware.light@2.0::ILight", 146 "android.hardware.media.c2@1.0::IComponentStore", 147 "android.hardware.media.omx@1.0::IOmx", 148 "android.hardware.media.omx@1.0::IOmxStore", 149 "android.hardware.neuralnetworks@1.0::IDevice", 150 "android.hardware.power@1.0::IPower", 151 "android.hardware.power.stats@1.0::IPowerStats", 152 "android.hardware.sensors@1.0::ISensors", 153 "android.hardware.sensors@2.0::ISensors", 154 "android.hardware.sensors@2.1::ISensors", 155 "android.hardware.vibrator@1.0::IVibrator", 156 "android.hardware.vr@1.0::IVr", 157 "android.system.suspend@1.0::ISystemSuspend" 158 ); 159 160 public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] { 161 "android.hardware.biometrics.face.IFace/", 162 "android.hardware.biometrics.fingerprint.IFingerprint/", 163 "android.hardware.graphics.composer3.IComposer/", 164 "android.hardware.input.processor.IInputProcessor/", 165 "android.hardware.light.ILights/", 166 "android.hardware.power.IPower/", 167 "android.hardware.power.stats.IPowerStats/", 168 "android.hardware.vibrator.IVibrator/", 169 "android.hardware.vibrator.IVibratorManager/" 170 }; 171 172 private static Watchdog sWatchdog; 173 174 private final Thread mThread; 175 176 private final Object mLock = new Object(); 177 178 /* This handler will be used to post message back onto the main thread */ 179 private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>(); 180 private final HandlerChecker mMonitorChecker; 181 private ActivityManagerService mActivity; 182 private IActivityController mController; 183 private boolean mAllowRestart = true; 184 // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings 185 // once the settings provider is initialized. 186 private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT; 187 private final List<Integer> mInterestingJavaPids = new ArrayList<>(); 188 private final TraceErrorLogger mTraceErrorLogger; 189 190 /** Holds a checker and its timeout. */ 191 static final class HandlerCheckerAndTimeout { 192 private final HandlerChecker mHandler; 193 private final Optional<Long> mCustomTimeoutMillis; 194 HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)195 private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) { 196 this.mHandler = checker; 197 this.mCustomTimeoutMillis = timeoutMillis; 198 } 199 checker()200 HandlerChecker checker() { 201 return mHandler; 202 } 203 204 /** Returns the timeout. */ customTimeoutMillis()205 Optional<Long> customTimeoutMillis() { 206 return mCustomTimeoutMillis; 207 } 208 209 /** 210 * Creates a checker with the default timeout. The timeout will use the default value which 211 * is configurable server-side. 212 */ withDefaultTimeout(HandlerChecker checker)213 static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) { 214 return new HandlerCheckerAndTimeout(checker, Optional.empty()); 215 } 216 217 /** 218 * Creates a checker with a custom timeout. The timeout overrides the default value and will 219 * always be used. 220 */ withCustomTimeout( HandlerChecker checker, long timeoutMillis)221 static HandlerCheckerAndTimeout withCustomTimeout( 222 HandlerChecker checker, long timeoutMillis) { 223 return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis)); 224 } 225 } 226 227 /** 228 * Used for checking status of handle threads and scheduling monitor callbacks. 229 */ 230 public final class HandlerChecker implements Runnable { 231 private final Handler mHandler; 232 private final String mName; 233 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 234 private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>(); 235 private long mWaitMax; 236 private boolean mCompleted; 237 private Monitor mCurrentMonitor; 238 private long mStartTime; 239 private int mPauseCount; 240 HandlerChecker(Handler handler, String name)241 HandlerChecker(Handler handler, String name) { 242 mHandler = handler; 243 mName = name; 244 mCompleted = true; 245 } 246 addMonitorLocked(Monitor monitor)247 void addMonitorLocked(Monitor monitor) { 248 // We don't want to update mMonitors when the Handler is in the middle of checking 249 // all monitors. We will update mMonitors on the next schedule if it is safe 250 mMonitorQueue.add(monitor); 251 } 252 253 /** 254 * Schedules a run on the handler thread. 255 * 256 * @param handlerCheckerTimeoutMillis the timeout to use for this run 257 */ scheduleCheckLocked(long handlerCheckerTimeoutMillis)258 public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) { 259 mWaitMax = handlerCheckerTimeoutMillis; 260 if (mCompleted) { 261 // Safe to update monitors in queue, Handler is not in the middle of work 262 mMonitors.addAll(mMonitorQueue); 263 mMonitorQueue.clear(); 264 } 265 if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) 266 || (mPauseCount > 0)) { 267 // Don't schedule until after resume OR 268 // If the target looper has recently been polling, then 269 // there is no reason to enqueue our checker on it since that 270 // is as good as it not being deadlocked. This avoid having 271 // to do a context switch to check the thread. Note that we 272 // only do this if we have no monitors since those would need to 273 // be executed at this point. 274 mCompleted = true; 275 return; 276 } 277 if (!mCompleted) { 278 // we already have a check in flight, so no need 279 return; 280 } 281 282 mCompleted = false; 283 mCurrentMonitor = null; 284 mStartTime = SystemClock.uptimeMillis(); 285 mHandler.postAtFrontOfQueue(this); 286 } 287 getCompletionStateLocked()288 public int getCompletionStateLocked() { 289 if (mCompleted) { 290 return COMPLETED; 291 } else { 292 long latency = SystemClock.uptimeMillis() - mStartTime; 293 if (latency < mWaitMax/2) { 294 return WAITING; 295 } else if (latency < mWaitMax) { 296 return WAITED_HALF; 297 } 298 } 299 return OVERDUE; 300 } 301 getThread()302 public Thread getThread() { 303 return mHandler.getLooper().getThread(); 304 } 305 getName()306 public String getName() { 307 return mName; 308 } 309 describeBlockedStateLocked()310 String describeBlockedStateLocked() { 311 if (mCurrentMonitor == null) { 312 return "Blocked in handler on " + mName + " (" + getThread().getName() + ")"; 313 } else { 314 return "Blocked in monitor " + mCurrentMonitor.getClass().getName() 315 + " on " + mName + " (" + getThread().getName() + ")"; 316 } 317 } 318 319 @Override run()320 public void run() { 321 // Once we get here, we ensure that mMonitors does not change even if we call 322 // #addMonitorLocked because we first add the new monitors to mMonitorQueue and 323 // move them to mMonitors on the next schedule when mCompleted is true, at which 324 // point we have completed execution of this method. 325 final int size = mMonitors.size(); 326 for (int i = 0 ; i < size ; i++) { 327 synchronized (mLock) { 328 mCurrentMonitor = mMonitors.get(i); 329 } 330 mCurrentMonitor.monitor(); 331 } 332 333 synchronized (mLock) { 334 mCompleted = true; 335 mCurrentMonitor = null; 336 } 337 } 338 339 /** Pause the HandlerChecker. */ pauseLocked(String reason)340 public void pauseLocked(String reason) { 341 mPauseCount++; 342 // Mark as completed, because there's a chance we called this after the watchog 343 // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure 344 // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED' 345 mCompleted = true; 346 Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: " 347 + reason + ". Pause count: " + mPauseCount); 348 } 349 350 /** Resume the HandlerChecker from the last {@link #pauseLocked}. */ resumeLocked(String reason)351 public void resumeLocked(String reason) { 352 if (mPauseCount > 0) { 353 mPauseCount--; 354 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: " 355 + reason + ". Pause count: " + mPauseCount); 356 } else { 357 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName); 358 } 359 } 360 } 361 362 final class RebootRequestReceiver extends BroadcastReceiver { 363 @Override onReceive(Context c, Intent intent)364 public void onReceive(Context c, Intent intent) { 365 if (intent.getIntExtra("nowait", 0) != 0) { 366 rebootSystem("Received ACTION_REBOOT broadcast"); 367 return; 368 } 369 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 370 } 371 } 372 373 /** Monitor for checking the availability of binder threads. The monitor will block until 374 * there is a binder thread available to process in coming IPCs to make sure other processes 375 * can still communicate with the service. 376 */ 377 private static final class BinderThreadMonitor implements Watchdog.Monitor { 378 @Override monitor()379 public void monitor() { 380 Binder.blockUntilThreadAvailable(); 381 } 382 } 383 384 public interface Monitor { monitor()385 void monitor(); 386 } 387 getInstance()388 public static Watchdog getInstance() { 389 if (sWatchdog == null) { 390 sWatchdog = new Watchdog(); 391 } 392 393 return sWatchdog; 394 } 395 Watchdog()396 private Watchdog() { 397 mThread = new Thread(this::run, "watchdog"); 398 399 // Initialize handler checkers for each common thread we want to check. Note 400 // that we are not currently checking the background thread, since it can 401 // potentially hold longer running operations with no guarantees about the timeliness 402 // of operations there. 403 // 404 // The shared foreground thread is the main checker. It is where we 405 // will also dispatch monitor checks and do other work. 406 mMonitorChecker = new HandlerChecker(FgThread.getHandler(), 407 "foreground thread"); 408 mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker)); 409 // Add checker for main thread. We only do a quick check since there 410 // can be UI running on the thread. 411 mHandlerCheckers.add(withDefaultTimeout( 412 new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread"))); 413 // Add checker for shared UI thread. 414 mHandlerCheckers.add(withDefaultTimeout( 415 new HandlerChecker(UiThread.getHandler(), "ui thread"))); 416 // And also check IO thread. 417 mHandlerCheckers.add(withDefaultTimeout( 418 new HandlerChecker(IoThread.getHandler(), "i/o thread"))); 419 // And the display thread. 420 mHandlerCheckers.add(withDefaultTimeout( 421 new HandlerChecker(DisplayThread.getHandler(), "display thread"))); 422 // And the animation thread. 423 mHandlerCheckers.add(withDefaultTimeout( 424 new HandlerChecker(AnimationThread.getHandler(), "animation thread"))); 425 // And the surface animation thread. 426 mHandlerCheckers.add(withDefaultTimeout( 427 new HandlerChecker(SurfaceAnimationThread.getHandler(), 428 "surface animation thread"))); 429 // Initialize monitor for Binder threads. 430 addMonitor(new BinderThreadMonitor()); 431 432 mInterestingJavaPids.add(Process.myPid()); 433 434 // See the notes on DEFAULT_TIMEOUT. 435 assert DB || 436 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; 437 438 mTraceErrorLogger = new TraceErrorLogger(); 439 } 440 441 /** 442 * Called by SystemServer to cause the internal thread to begin execution. 443 */ start()444 public void start() { 445 mThread.start(); 446 } 447 448 /** 449 * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot. 450 * Should be called during boot after the ActivityManagerService is up and registered 451 * as a system service so it can handle registration of a {@link BroadcastReceiver}. 452 */ init(Context context, ActivityManagerService activity)453 public void init(Context context, ActivityManagerService activity) { 454 mActivity = activity; 455 context.registerReceiver(new RebootRequestReceiver(), 456 new IntentFilter(Intent.ACTION_REBOOT), 457 android.Manifest.permission.REBOOT, null); 458 } 459 460 private static class SettingsObserver extends ContentObserver { 461 private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS); 462 private final Context mContext; 463 private final Watchdog mWatchdog; 464 SettingsObserver(Context context, Watchdog watchdog)465 SettingsObserver(Context context, Watchdog watchdog) { 466 super(BackgroundThread.getHandler()); 467 mContext = context; 468 mWatchdog = watchdog; 469 // Always kick once to ensure that we match current state 470 onChange(); 471 } 472 473 @Override onChange(boolean selfChange, Uri uri, int userId)474 public void onChange(boolean selfChange, Uri uri, int userId) { 475 if (mUri.equals(uri)) { 476 onChange(); 477 } 478 } 479 onChange()480 public void onChange() { 481 try { 482 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong( 483 mContext.getContentResolver(), 484 Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT)); 485 } catch (RuntimeException e) { 486 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e); 487 } 488 } 489 } 490 491 /** 492 * Register an observer to listen to settings. 493 * 494 * It needs to be called after the settings service is initialized. 495 */ registerSettingsObserver(Context context)496 public void registerSettingsObserver(Context context) { 497 context.getContentResolver().registerContentObserver( 498 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS), 499 false, 500 new SettingsObserver(context, this), 501 UserHandle.USER_SYSTEM); 502 } 503 504 /** 505 * Updates watchdog timeout values. 506 */ updateWatchdogTimeout(long timeoutMillis)507 void updateWatchdogTimeout(long timeoutMillis) { 508 // See the notes on DEFAULT_TIMEOUT. 509 if (!DB && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) { 510 timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1; 511 } 512 mWatchdogTimeoutMillis = timeoutMillis; 513 Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis"); 514 } 515 isInterestingJavaProcess(String processName)516 private static boolean isInterestingJavaProcess(String processName) { 517 return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName) 518 || processName.equals("com.android.phone"); 519 } 520 521 /** 522 * Notifies the watchdog when a Java process with {@code pid} is started. 523 * This process may have its stack trace dumped during an ANR. 524 */ processStarted(String processName, int pid)525 public void processStarted(String processName, int pid) { 526 if (isInterestingJavaProcess(processName)) { 527 Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid); 528 synchronized (mLock) { 529 mInterestingJavaPids.add(pid); 530 } 531 } 532 } 533 534 /** 535 * Notifies the watchdog when a Java process with {@code pid} dies. 536 */ processDied(String processName, int pid)537 public void processDied(String processName, int pid) { 538 if (isInterestingJavaProcess(processName)) { 539 Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid); 540 synchronized (mLock) { 541 mInterestingJavaPids.remove(Integer.valueOf(pid)); 542 } 543 } 544 } 545 setActivityController(IActivityController controller)546 public void setActivityController(IActivityController controller) { 547 synchronized (mLock) { 548 mController = controller; 549 } 550 } 551 setAllowRestart(boolean allowRestart)552 public void setAllowRestart(boolean allowRestart) { 553 synchronized (mLock) { 554 mAllowRestart = allowRestart; 555 } 556 } 557 addMonitor(Monitor monitor)558 public void addMonitor(Monitor monitor) { 559 synchronized (mLock) { 560 mMonitorChecker.addMonitorLocked(monitor); 561 } 562 } 563 addThread(Handler thread)564 public void addThread(Handler thread) { 565 synchronized (mLock) { 566 final String name = thread.getLooper().getThread().getName(); 567 mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name))); 568 } 569 } 570 addThread(Handler thread, long timeoutMillis)571 public void addThread(Handler thread, long timeoutMillis) { 572 synchronized (mLock) { 573 final String name = thread.getLooper().getThread().getName(); 574 mHandlerCheckers.add( 575 withCustomTimeout(new HandlerChecker(thread, name), timeoutMillis)); 576 } 577 } 578 579 /** 580 * Pauses Watchdog action for the currently running thread. Useful before executing long running 581 * operations that could falsely trigger the watchdog. Each call to this will require a matching 582 * call to {@link #resumeWatchingCurrentThread}. 583 * 584 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 585 * 586 * <p>If the Watchdog is already paused for the current thread, this call adds 587 * adds another pause and will require an additional {@link #resumeCurrentThread} to resume. 588 * 589 * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all 590 * pauses have been resumed. 591 */ pauseWatchingCurrentThread(String reason)592 public void pauseWatchingCurrentThread(String reason) { 593 synchronized (mLock) { 594 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 595 HandlerChecker checker = hc.checker(); 596 if (Thread.currentThread().equals(checker.getThread())) { 597 checker.pauseLocked(reason); 598 } 599 } 600 } 601 } 602 603 /** 604 * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running 605 * thread. 606 * 607 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 608 * 609 * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf. 610 * 611 * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise, 612 * the Watchdog action for the current thread remains paused until resume is called at least 613 * as many times as the calls to pause. 614 */ resumeWatchingCurrentThread(String reason)615 public void resumeWatchingCurrentThread(String reason) { 616 synchronized (mLock) { 617 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 618 HandlerChecker checker = hc.checker(); 619 if (Thread.currentThread().equals(checker.getThread())) { 620 checker.resumeLocked(reason); 621 } 622 } 623 } 624 } 625 626 /** 627 * Perform a full reboot of the system. 628 */ rebootSystem(String reason)629 void rebootSystem(String reason) { 630 Slog.i(TAG, "Rebooting system because: " + reason); 631 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 632 try { 633 pms.reboot(false, reason, false); 634 } catch (RemoteException ex) { 635 } 636 } 637 evaluateCheckerCompletionLocked()638 private int evaluateCheckerCompletionLocked() { 639 int state = COMPLETED; 640 for (int i=0; i<mHandlerCheckers.size(); i++) { 641 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 642 state = Math.max(state, hc.getCompletionStateLocked()); 643 } 644 return state; 645 } 646 getCheckersWithStateLocked(int completionState)647 private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) { 648 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 649 for (int i=0; i<mHandlerCheckers.size(); i++) { 650 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 651 if (hc.getCompletionStateLocked() == completionState) { 652 checkers.add(hc); 653 } 654 } 655 return checkers; 656 } 657 describeCheckersLocked(List<HandlerChecker> checkers)658 private String describeCheckersLocked(List<HandlerChecker> checkers) { 659 StringBuilder builder = new StringBuilder(128); 660 for (int i=0; i<checkers.size(); i++) { 661 if (builder.length() > 0) { 662 builder.append(", "); 663 } 664 builder.append(checkers.get(i).describeBlockedStateLocked()); 665 } 666 return builder.toString(); 667 } 668 addInterestingHidlPids(HashSet<Integer> pids)669 private static void addInterestingHidlPids(HashSet<Integer> pids) { 670 try { 671 IServiceManager serviceManager = IServiceManager.getService(); 672 ArrayList<IServiceManager.InstanceDebugInfo> dump = 673 serviceManager.debugDump(); 674 for (IServiceManager.InstanceDebugInfo info : dump) { 675 if (info.pid == IServiceManager.PidConstant.NO_PID) { 676 continue; 677 } 678 679 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 680 continue; 681 } 682 683 pids.add(info.pid); 684 } 685 } catch (RemoteException e) { 686 Log.w(TAG, e); 687 } 688 } 689 addInterestingAidlPids(HashSet<Integer> pids)690 private static void addInterestingAidlPids(HashSet<Integer> pids) { 691 ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo(); 692 if (infos == null) return; 693 694 for (ServiceDebugInfo info : infos) { 695 for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) { 696 if (info.name.startsWith(prefix)) { 697 pids.add(info.debugPid); 698 } 699 } 700 } 701 } 702 getInterestingNativePids()703 static ArrayList<Integer> getInterestingNativePids() { 704 HashSet<Integer> pids = new HashSet<>(); 705 addInterestingAidlPids(pids); 706 addInterestingHidlPids(pids); 707 708 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 709 if (nativePids != null) { 710 for (int i : nativePids) { 711 pids.add(i); 712 } 713 } 714 715 return new ArrayList<Integer>(pids); 716 } 717 run()718 private void run() { 719 boolean waitedHalf = false; 720 721 while (true) { 722 List<HandlerChecker> blockedCheckers = Collections.emptyList(); 723 String subject = ""; 724 boolean allowRestart = true; 725 int debuggerWasConnected = 0; 726 boolean doWaitedHalfDump = false; 727 // The value of mWatchdogTimeoutMillis might change while we are executing the loop. 728 // We store the current value to use a consistent value for all handlers. 729 final long watchdogTimeoutMillis = mWatchdogTimeoutMillis; 730 final long checkIntervalMillis = watchdogTimeoutMillis / 2; 731 final ArrayList<Integer> pids; 732 synchronized (mLock) { 733 long timeout = checkIntervalMillis; 734 // Make sure we (re)spin the checkers that have become idle within 735 // this wait-and-check interval 736 for (int i=0; i<mHandlerCheckers.size(); i++) { 737 HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i); 738 // We pick the watchdog to apply every time we reschedule the checkers. The 739 // default timeout might have changed since the last run. 740 hc.checker().scheduleCheckLocked(hc.customTimeoutMillis() 741 .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER)); 742 } 743 744 if (debuggerWasConnected > 0) { 745 debuggerWasConnected--; 746 } 747 748 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 749 // wait while asleep. If the device is asleep then the thing that we are waiting 750 // to timeout on is asleep as well and won't have a chance to run, causing a false 751 // positive on when to kill things. 752 long start = SystemClock.uptimeMillis(); 753 while (timeout > 0) { 754 if (Debug.isDebuggerConnected()) { 755 debuggerWasConnected = 2; 756 } 757 try { 758 mLock.wait(timeout); 759 // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting 760 } catch (InterruptedException e) { 761 Log.wtf(TAG, e); 762 } 763 if (Debug.isDebuggerConnected()) { 764 debuggerWasConnected = 2; 765 } 766 timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start); 767 } 768 769 final int waitState = evaluateCheckerCompletionLocked(); 770 if (waitState == COMPLETED) { 771 // The monitors have returned; reset 772 waitedHalf = false; 773 continue; 774 } else if (waitState == WAITING) { 775 // still waiting but within their configured intervals; back off and recheck 776 continue; 777 } else if (waitState == WAITED_HALF) { 778 if (!waitedHalf) { 779 Slog.i(TAG, "WAITED_HALF"); 780 waitedHalf = true; 781 // We've waited half, but we'd need to do the stack trace dump w/o the lock. 782 blockedCheckers = getCheckersWithStateLocked(WAITED_HALF); 783 subject = describeCheckersLocked(blockedCheckers); 784 pids = new ArrayList<>(mInterestingJavaPids); 785 doWaitedHalfDump = true; 786 } else { 787 continue; 788 } 789 } else { 790 // something is overdue! 791 blockedCheckers = getCheckersWithStateLocked(OVERDUE); 792 subject = describeCheckersLocked(blockedCheckers); 793 allowRestart = mAllowRestart; 794 pids = new ArrayList<>(mInterestingJavaPids); 795 } 796 } // END synchronized (mLock) 797 798 // If we got here, that means that the system is most likely hung. 799 // 800 // First collect stack traces from all threads of the system process. 801 // 802 // Then, if we reached the full timeout, kill this process so that the system will 803 // restart. If we reached half of the timeout, just log some information and continue. 804 logWatchog(doWaitedHalfDump, subject, pids); 805 806 if (doWaitedHalfDump) { 807 // We have waited for only half of the timeout, we continue to wait for the duration 808 // of the full timeout before killing the process. 809 continue; 810 } 811 812 IActivityController controller; 813 synchronized (mLock) { 814 controller = mController; 815 } 816 if (controller != null) { 817 Slog.i(TAG, "Reporting stuck state to activity controller"); 818 try { 819 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 820 // 1 = keep waiting, -1 = kill system 821 int res = controller.systemNotResponding(subject); 822 if (res >= 0) { 823 Slog.i(TAG, "Activity controller requested to coninue to wait"); 824 waitedHalf = false; 825 continue; 826 } 827 } catch (RemoteException e) { 828 } 829 } 830 831 // Only kill the process if the debugger is not attached. 832 if (Debug.isDebuggerConnected()) { 833 debuggerWasConnected = 2; 834 } 835 if (debuggerWasConnected >= 2) { 836 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 837 } else if (debuggerWasConnected > 0) { 838 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 839 } else if (!allowRestart) { 840 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 841 } else { 842 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 843 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); 844 Slog.w(TAG, "*** GOODBYE!"); 845 if (!Build.IS_USER && isCrashLoopFound() 846 && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) { 847 breakCrashLoop(); 848 } 849 Process.killProcess(Process.myPid()); 850 System.exit(10); 851 } 852 853 waitedHalf = false; 854 } 855 } 856 logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids)857 private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) { 858 // Get critical event log before logging the half watchdog so that it doesn't 859 // occur in the log. 860 String criticalEvents = 861 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile(); 862 final UUID errorId = mTraceErrorLogger.generateErrorId(); 863 if (mTraceErrorLogger.isAddErrorIdEnabled()) { 864 mTraceErrorLogger.addErrorIdToTrace("system_server", errorId); 865 mTraceErrorLogger.addSubjectToTrace(subject, errorId); 866 } 867 868 final String dropboxTag; 869 if (halfWatchdog) { 870 dropboxTag = "pre_watchdog"; 871 CriticalEventLog.getInstance().logHalfWatchdog(subject); 872 } else { 873 dropboxTag = "watchdog"; 874 CriticalEventLog.getInstance().logWatchdog(subject, errorId); 875 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 876 // Log the atom as early as possible since it is used as a mechanism to trigger 877 // Perfetto. Ideally, the Perfetto trace capture should happen as close to the 878 // point in time when the Watchdog happens as possible. 879 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject); 880 } 881 882 long anrTime = SystemClock.uptimeMillis(); 883 StringBuilder report = new StringBuilder(); 884 report.append(MemoryPressureUtil.currentPsiState()); 885 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false); 886 StringWriter tracesFileException = new StringWriter(); 887 final File stack = ActivityManagerService.dumpStackTraces( 888 pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(), 889 tracesFileException, subject, criticalEvents); 890 // Give some extra time to make sure the stack traces get written. 891 // The system's been hanging for a whlie, another second or two won't hurt much. 892 SystemClock.sleep(5000); 893 processCpuTracker.update(); 894 report.append(processCpuTracker.printCurrentState(anrTime)); 895 report.append(tracesFileException.getBuffer()); 896 897 if (!halfWatchdog) { 898 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the 899 // kernel log 900 doSysRq('w'); 901 doSysRq('l'); 902 } 903 904 // Try to add the error to the dropbox, but assuming that the ActivityManager 905 // itself may be deadlocked. (which has happened, causing this statement to 906 // deadlock and the watchdog as a whole to be ineffective) 907 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 908 public void run() { 909 // If a watched thread hangs before init() is called, we don't have a 910 // valid mActivity. So we can't log the error to dropbox. 911 if (mActivity != null) { 912 mActivity.addErrorToDropBox( 913 dropboxTag, null, "system_server", null, null, null, 914 null, report.toString(), stack, null, null, null, 915 errorId); 916 } 917 } 918 }; 919 dropboxThread.start(); 920 try { 921 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 922 } catch (InterruptedException ignored) { } 923 } 924 doSysRq(char c)925 private void doSysRq(char c) { 926 try { 927 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 928 sysrq_trigger.write(c); 929 sysrq_trigger.close(); 930 } catch (IOException e) { 931 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 932 } 933 } 934 resetTimeoutHistory()935 private void resetTimeoutHistory() { 936 writeTimeoutHistory(new ArrayList<String>()); 937 } 938 writeTimeoutHistory(Iterable<String> crashHistory)939 private void writeTimeoutHistory(Iterable<String> crashHistory) { 940 String data = String.join(",", crashHistory); 941 942 try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) { 943 writer.write(SystemProperties.get("ro.boottime.zygote")); 944 writer.write(":"); 945 writer.write(data); 946 } catch (IOException e) { 947 Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e); 948 } 949 } 950 readTimeoutHistory()951 private String[] readTimeoutHistory() { 952 final String[] emptyStringArray = {}; 953 954 try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) { 955 String line = reader.readLine(); 956 if (line == null) { 957 return emptyStringArray; 958 } 959 960 String[] data = line.trim().split(":"); 961 String boottime = data.length >= 1 ? data[0] : ""; 962 String history = data.length >= 2 ? data[1] : ""; 963 if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) { 964 return history.split(","); 965 } else { 966 return emptyStringArray; 967 } 968 } catch (FileNotFoundException e) { 969 return emptyStringArray; 970 } catch (IOException e) { 971 Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e); 972 return emptyStringArray; 973 } 974 } 975 hasActiveUsbConnection()976 private boolean hasActiveUsbConnection() { 977 try { 978 final String state = FileUtils.readTextFile( 979 new File("/sys/class/android_usb/android0/state"), 980 128 /*max*/, null /*ellipsis*/).trim(); 981 if ("CONFIGURED".equals(state)) { 982 return true; 983 } 984 } catch (IOException e) { 985 Slog.w(TAG, "Failed to determine if device was on USB", e); 986 } 987 return false; 988 } 989 isCrashLoopFound()990 private boolean isCrashLoopFound() { 991 int fatalCount = WatchdogProperties.fatal_count().orElse(0); 992 long fatalWindowMs = TimeUnit.SECONDS.toMillis( 993 WatchdogProperties.fatal_window_seconds().orElse(0)); 994 if (fatalCount == 0 || fatalWindowMs == 0) { 995 if (fatalCount != fatalWindowMs) { 996 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together", 997 PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS)); 998 } 999 return false; 1000 } 1001 1002 // new-history = [last (fatalCount - 1) items in old-history] + [nowMs]. 1003 long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep. 1004 String[] rawCrashHistory = readTimeoutHistory(); 1005 ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange( 1006 rawCrashHistory, 1007 Math.max(0, rawCrashHistory.length - fatalCount - 1), 1008 rawCrashHistory.length))); 1009 // Something wrong here. 1010 crashHistory.add(String.valueOf(nowMs)); 1011 writeTimeoutHistory(crashHistory); 1012 1013 // Returns false if the device has an active USB connection. 1014 if (hasActiveUsbConnection()) { 1015 return false; 1016 } 1017 1018 long firstCrashMs; 1019 try { 1020 firstCrashMs = Long.parseLong(crashHistory.get(0)); 1021 } catch (NumberFormatException t) { 1022 Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t); 1023 resetTimeoutHistory(); 1024 return false; 1025 } 1026 return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs; 1027 } 1028 breakCrashLoop()1029 private void breakCrashLoop() { 1030 try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) { 1031 kmsg.append("Fatal reset to escape the system_server crashing loop\n"); 1032 } catch (IOException e) { 1033 Slog.w(TAG, "Failed to append to kmsg", e); 1034 } 1035 doSysRq('c'); 1036 } 1037 1038 @Override dump(@onNull PrintWriter pw, @Nullable String[] args)1039 public void dump(@NonNull PrintWriter pw, @Nullable String[] args) { 1040 pw.print("WatchdogTimeoutMillis="); 1041 pw.println(mWatchdogTimeoutMillis); 1042 } 1043 } 1044