1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout; 20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout; 21 22 import android.annotation.NonNull; 23 import android.annotation.Nullable; 24 import android.app.IActivityController; 25 import android.content.BroadcastReceiver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.database.ContentObserver; 30 import android.hidl.manager.V1_0.IServiceManager; 31 import android.net.Uri; 32 import android.os.Binder; 33 import android.os.Build; 34 import android.os.Debug; 35 import android.os.FileUtils; 36 import android.os.Handler; 37 import android.os.IPowerManager; 38 import android.os.Looper; 39 import android.os.Process; 40 import android.os.RemoteException; 41 import android.os.ServiceDebugInfo; 42 import android.os.ServiceManager; 43 import android.os.SystemClock; 44 import android.os.SystemProperties; 45 import android.os.UserHandle; 46 import android.provider.Settings; 47 import android.sysprop.WatchdogProperties; 48 import android.util.Dumpable; 49 import android.util.EventLog; 50 import android.util.Log; 51 import android.util.Slog; 52 import android.util.SparseBooleanArray; 53 54 import com.android.internal.os.BackgroundThread; 55 import com.android.internal.os.ProcessCpuTracker; 56 import com.android.internal.os.ZygoteConnectionConstants; 57 import com.android.internal.util.FrameworkStatsLog; 58 import com.android.server.am.ActivityManagerService; 59 import com.android.server.am.StackTracesDumpHelper; 60 import com.android.server.am.TraceErrorLogger; 61 import com.android.server.criticalevents.CriticalEventLog; 62 import com.android.server.wm.SurfaceAnimationThread; 63 64 import java.io.BufferedReader; 65 import java.io.File; 66 import java.io.FileNotFoundException; 67 import java.io.FileReader; 68 import java.io.FileWriter; 69 import java.io.IOException; 70 import java.io.PrintWriter; 71 import java.io.StringWriter; 72 import java.time.Clock; 73 import java.util.ArrayList; 74 import java.util.Arrays; 75 import java.util.Collections; 76 import java.util.HashSet; 77 import java.util.LinkedHashMap; 78 import java.util.List; 79 import java.util.Optional; 80 import java.util.UUID; 81 import java.util.concurrent.CompletableFuture; 82 import java.util.concurrent.TimeUnit; 83 84 /** 85 * This class calls its monitor every minute. Killing this process if they don't return 86 **/ 87 public class Watchdog implements Dumpable { 88 static final String TAG = "Watchdog"; 89 90 /** Debug flag. */ 91 public static final boolean DEBUG = false; 92 93 // Set this to true to use debug default values. 94 private static final boolean DB = false; 95 96 // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with 97 // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications 98 // can trigger the watchdog. 99 // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped 100 // applications may not work with a debug build. CTS will fail. 101 private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000; 102 103 // This ratio is used to compute the pre-watchdog timeout (2 means that the pre-watchdog timeout 104 // will be half the full timeout). 105 // 106 // The pre-watchdog event is similar to a full watchdog except it does not crash system server. 107 private static final int PRE_WATCHDOG_TIMEOUT_RATIO = 4; 108 109 // These are temporally ordered: larger values as lateness increases 110 static final int COMPLETED = 0; 111 static final int WAITING = 1; 112 static final int WAITED_UNTIL_PRE_WATCHDOG = 2; 113 static final int OVERDUE = 3; 114 115 // Track watchdog timeout history and break the crash loop if there is. 116 private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt"; 117 private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count"; 118 private static final String PROP_FATAL_LOOP_WINDOWS_SECS = 119 "framework_watchdog.fatal_window.second"; 120 121 // Which native processes to dump into dropbox's stack traces 122 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 123 "/system/bin/audioserver", 124 "/system/bin/cameraserver", 125 "/system/bin/drmserver", 126 "/system/bin/keystore2", 127 "/system/bin/mediadrmserver", 128 "/system/bin/mediaserver", 129 "/system/bin/netd", 130 "/system/bin/sdcard", 131 "/system/bin/servicemanager", 132 "/system/bin/surfaceflinger", 133 "/system/bin/vold", 134 "media.extractor", // system/bin/mediaextractor 135 "media.metrics", // system/bin/mediametrics 136 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 137 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec 138 "media.transcoding", // Media transcoding service 139 "com.android.bluetooth", // Bluetooth service 140 "/apex/com.android.art/bin/artd", // ART daemon 141 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon 142 }; 143 144 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 145 "android.hardware.audio@4.0::IDevicesFactory", 146 "android.hardware.audio@5.0::IDevicesFactory", 147 "android.hardware.audio@6.0::IDevicesFactory", 148 "android.hardware.audio@7.0::IDevicesFactory", 149 "android.hardware.biometrics.face@1.0::IBiometricsFace", 150 "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint", 151 "android.hardware.bluetooth@1.0::IBluetoothHci", 152 "android.hardware.camera.provider@2.4::ICameraProvider", 153 "android.hardware.gnss@1.0::IGnss", 154 "android.hardware.graphics.allocator@2.0::IAllocator", 155 "android.hardware.graphics.allocator@4.0::IAllocator", 156 "android.hardware.graphics.composer@2.1::IComposer", 157 "android.hardware.health@2.0::IHealth", 158 "android.hardware.light@2.0::ILight", 159 "android.hardware.media.c2@1.0::IComponentStore", 160 "android.hardware.media.omx@1.0::IOmx", 161 "android.hardware.media.omx@1.0::IOmxStore", 162 "android.hardware.neuralnetworks@1.0::IDevice", 163 "android.hardware.power@1.0::IPower", 164 "android.hardware.power.stats@1.0::IPowerStats", 165 "android.hardware.sensors@1.0::ISensors", 166 "android.hardware.sensors@2.0::ISensors", 167 "android.hardware.sensors@2.1::ISensors", 168 "android.hardware.vr@1.0::IVr", 169 "android.system.suspend@1.0::ISystemSuspend" 170 ); 171 172 public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] { 173 "android.hardware.audio.core.IModule/", 174 "android.hardware.audio.core.IConfig/", 175 "android.hardware.audio.effect.IFactory/", 176 "android.hardware.biometrics.face.IFace/", 177 "android.hardware.biometrics.fingerprint.IFingerprint/", 178 "android.hardware.bluetooth.IBluetoothHci/", 179 "android.hardware.camera.provider.ICameraProvider/", 180 "android.hardware.drm.IDrmFactory/", 181 "android.hardware.gnss.IGnss/", 182 "android.hardware.graphics.allocator.IAllocator/", 183 "android.hardware.graphics.composer3.IComposer/", 184 "android.hardware.health.IHealth/", 185 "android.hardware.input.processor.IInputProcessor/", 186 "android.hardware.light.ILights/", 187 "android.hardware.neuralnetworks.IDevice/", 188 "android.hardware.power.IPower/", 189 "android.hardware.power.stats.IPowerStats/", 190 "android.hardware.sensors.ISensors/", 191 "android.hardware.vibrator.IVibrator/", 192 "android.hardware.vibrator.IVibratorManager/", 193 "android.hardware.wifi.hostapd.IHostapd/", 194 "android.hardware.wifi.IWifi/", 195 "android.hardware.wifi.supplicant.ISupplicant/", 196 "android.system.suspend.ISystemSuspend/", 197 }; 198 199 private static Watchdog sWatchdog; 200 201 private final Thread mThread; 202 203 private final Object mLock = new Object(); 204 205 /* This handler will be used to post message back onto the main thread */ 206 private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>(); 207 private final HandlerChecker mMonitorChecker; 208 private ActivityManagerService mActivity; 209 private IActivityController mController; 210 private boolean mAllowRestart = true; 211 // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings 212 // once the settings provider is initialized. 213 private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT; 214 private final List<Integer> mInterestingJavaPids = new ArrayList<>(); 215 private final TraceErrorLogger mTraceErrorLogger; 216 217 /** Holds a checker and its timeout. */ 218 static final class HandlerCheckerAndTimeout { 219 private final HandlerChecker mHandler; 220 private final Optional<Long> mCustomTimeoutMillis; 221 HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)222 private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) { 223 this.mHandler = checker; 224 this.mCustomTimeoutMillis = timeoutMillis; 225 } 226 checker()227 HandlerChecker checker() { 228 return mHandler; 229 } 230 231 /** Returns the timeout. */ customTimeoutMillis()232 Optional<Long> customTimeoutMillis() { 233 return mCustomTimeoutMillis; 234 } 235 236 /** 237 * Creates a checker with the default timeout. The timeout will use the default value which 238 * is configurable server-side. 239 */ withDefaultTimeout(HandlerChecker checker)240 static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) { 241 return new HandlerCheckerAndTimeout(checker, Optional.empty()); 242 } 243 244 /** 245 * Creates a checker with a custom timeout. The timeout overrides the default value and will 246 * always be used. 247 */ withCustomTimeout( HandlerChecker checker, long timeoutMillis)248 static HandlerCheckerAndTimeout withCustomTimeout( 249 HandlerChecker checker, long timeoutMillis) { 250 return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis)); 251 } 252 } 253 254 /** Used for checking status of handle threads and scheduling monitor callbacks. */ 255 public static class HandlerChecker implements Runnable { 256 private final Handler mHandler; 257 private final String mName; 258 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 259 private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>(); 260 private long mWaitMaxMillis; 261 private boolean mCompleted; 262 private Monitor mCurrentMonitor; 263 private long mStartTimeMillis; 264 private int mPauseCount; 265 private long mPauseEndTimeMillis; 266 private Clock mClock; 267 private Object mLock; 268 HandlerChecker(Handler handler, String name, Object lock, Clock clock)269 HandlerChecker(Handler handler, String name, Object lock, Clock clock) { 270 mHandler = handler; 271 mName = name; 272 mLock = lock; 273 mCompleted = true; 274 mClock = clock; 275 } 276 HandlerChecker(Handler handler, String name, Object lock)277 HandlerChecker(Handler handler, String name, Object lock) { 278 this(handler, name, lock, SystemClock.uptimeClock()); 279 } 280 addMonitorLocked(Monitor monitor)281 void addMonitorLocked(Monitor monitor) { 282 // We don't want to update mMonitors when the Handler is in the middle of checking 283 // all monitors. We will update mMonitors on the next schedule if it is safe 284 mMonitorQueue.add(monitor); 285 } 286 287 /** 288 * Schedules a run on the handler thread. 289 * 290 * @param handlerCheckerTimeoutMillis the timeout to use for this run 291 */ scheduleCheckLocked(long handlerCheckerTimeoutMillis)292 public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) { 293 mWaitMaxMillis = handlerCheckerTimeoutMillis; 294 295 if (mCompleted && !mMonitorQueue.isEmpty()) { 296 // Safe to update monitors in queue, Handler is not in the middle of work 297 mMonitors.addAll(mMonitorQueue); 298 mMonitorQueue.clear(); 299 } 300 301 long nowMillis = mClock.millis(); 302 boolean isPaused = mPauseCount > 0 || mPauseEndTimeMillis > nowMillis; 303 if ((mMonitors.size() == 0 && isHandlerPolling()) || isPaused) { 304 // Don't schedule until after resume OR 305 // If the target looper has recently been polling, then 306 // there is no reason to enqueue our checker on it since that 307 // is as good as it not being deadlocked. This avoid having 308 // to do a context switch to check the thread. Note that we 309 // only do this if we have no monitors since those would need to 310 // be executed at this point. 311 mCompleted = true; 312 return; 313 } 314 if (!mCompleted) { 315 // we already have a check in flight, so no need 316 return; 317 } 318 319 mCompleted = false; 320 mCurrentMonitor = null; 321 mStartTimeMillis = nowMillis; 322 mPauseEndTimeMillis = 0; 323 mHandler.postAtFrontOfQueue(this); 324 } 325 isHandlerPolling()326 boolean isHandlerPolling() { 327 return mHandler.getLooper().getQueue().isPolling(); 328 } 329 getCompletionStateLocked()330 public int getCompletionStateLocked() { 331 if (mCompleted) { 332 return COMPLETED; 333 } else { 334 long latency = mClock.millis() - mStartTimeMillis; 335 if (latency < mWaitMaxMillis / PRE_WATCHDOG_TIMEOUT_RATIO) { 336 return WAITING; 337 } else if (latency < mWaitMaxMillis) { 338 return WAITED_UNTIL_PRE_WATCHDOG; 339 } 340 } 341 return OVERDUE; 342 } 343 getThread()344 public Thread getThread() { 345 return mHandler.getLooper().getThread(); 346 } 347 getName()348 public String getName() { 349 return mName; 350 } 351 describeBlockedStateLocked()352 String describeBlockedStateLocked() { 353 final String prefix; 354 if (mCurrentMonitor == null) { 355 prefix = "Blocked in handler"; 356 } else { 357 prefix = "Blocked in monitor " + mCurrentMonitor.getClass().getName(); 358 } 359 long latencySeconds = (mClock.millis() - mStartTimeMillis) / 1000; 360 return prefix + " on " + mName + " (" + getThread().getName() + ")" 361 + " for " + latencySeconds + "s"; 362 } 363 364 @Override run()365 public void run() { 366 // Once we get here, we ensure that mMonitors does not change even if we call 367 // #addMonitorLocked because we first add the new monitors to mMonitorQueue and 368 // move them to mMonitors on the next schedule when mCompleted is true, at which 369 // point we have completed execution of this method. 370 final int size = mMonitors.size(); 371 for (int i = 0 ; i < size ; i++) { 372 synchronized (mLock) { 373 mCurrentMonitor = mMonitors.get(i); 374 } 375 mCurrentMonitor.monitor(); 376 } 377 378 synchronized (mLock) { 379 mCompleted = true; 380 mCurrentMonitor = null; 381 } 382 } 383 384 /** 385 * Pauses the checks for the given time. 386 * 387 * <p>The current run will be ignored and another run will be scheduled after 388 * the given time. 389 */ pauseForLocked(int pauseMillis, String reason)390 public void pauseForLocked(int pauseMillis, String reason) { 391 mPauseEndTimeMillis = mClock.millis() + pauseMillis; 392 // Mark as completed, because there's a chance we called this after the watchog 393 // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we 394 // want to ensure the next call to #getCompletionStateLocked for this checker returns 395 // 'COMPLETED' 396 mCompleted = true; 397 Slog.i(TAG, "Pausing of HandlerChecker: " + mName + " for reason: " 398 + reason + ". Pause end time: " + mPauseEndTimeMillis); 399 } 400 401 /** Pause the HandlerChecker. */ pauseLocked(String reason)402 public void pauseLocked(String reason) { 403 mPauseCount++; 404 // Mark as completed, because there's a chance we called this after the watchog 405 // thread loop called Object#wait after 'WAITED_UNTIL_PRE_WATCHDOG'. In that case we 406 // want to ensure the next call to #getCompletionStateLocked for this checker returns 407 // 'COMPLETED' 408 mCompleted = true; 409 Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: " 410 + reason + ". Pause count: " + mPauseCount); 411 } 412 413 /** Resume the HandlerChecker from the last {@link #pauseLocked}. */ resumeLocked(String reason)414 public void resumeLocked(String reason) { 415 if (mPauseCount > 0) { 416 mPauseCount--; 417 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: " 418 + reason + ". Pause count: " + mPauseCount); 419 } else { 420 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName); 421 } 422 } 423 424 @Override toString()425 public String toString() { 426 return "CheckerHandler for " + mName; 427 } 428 } 429 430 final class RebootRequestReceiver extends BroadcastReceiver { 431 @Override onReceive(Context c, Intent intent)432 public void onReceive(Context c, Intent intent) { 433 if (intent.getIntExtra("nowait", 0) != 0) { 434 rebootSystem("Received ACTION_REBOOT broadcast"); 435 return; 436 } 437 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 438 } 439 } 440 441 /** Monitor for checking the availability of binder threads. The monitor will block until 442 * there is a binder thread available to process in coming IPCs to make sure other processes 443 * can still communicate with the service. 444 */ 445 private static final class BinderThreadMonitor implements Watchdog.Monitor { 446 @Override monitor()447 public void monitor() { 448 Binder.blockUntilThreadAvailable(); 449 } 450 } 451 452 public interface Monitor { monitor()453 void monitor(); 454 } 455 getInstance()456 public static Watchdog getInstance() { 457 if (sWatchdog == null) { 458 sWatchdog = new Watchdog(); 459 } 460 461 return sWatchdog; 462 } 463 Watchdog()464 private Watchdog() { 465 mThread = new Thread(this::run, "watchdog"); 466 467 // Initialize handler checkers for each common thread we want to check. Note 468 // that we are not currently checking the background thread, since it can 469 // potentially hold longer running operations with no guarantees about the timeliness 470 // of operations there. 471 // 472 // Use a custom thread to check monitors to avoid lock contention from impacted other 473 // threads. 474 ServiceThread t = new ServiceThread("watchdog.monitor", 475 android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/); 476 t.start(); 477 mMonitorChecker = new HandlerChecker(new Handler(t.getLooper()), "monitor thread", mLock); 478 mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker)); 479 480 mHandlerCheckers.add( 481 withDefaultTimeout( 482 new HandlerChecker(FgThread.getHandler(), "foreground thread", mLock))); 483 // Add checker for main thread. We only do a quick check since there 484 // can be UI running on the thread. 485 mHandlerCheckers.add( 486 withDefaultTimeout( 487 new HandlerChecker( 488 new Handler(Looper.getMainLooper()), "main thread", mLock))); 489 // Add checker for shared UI thread. 490 mHandlerCheckers.add( 491 withDefaultTimeout(new HandlerChecker(UiThread.getHandler(), "ui thread", mLock))); 492 // And also check IO thread. 493 mHandlerCheckers.add( 494 withDefaultTimeout(new HandlerChecker(IoThread.getHandler(), "i/o thread", mLock))); 495 // And the display thread. 496 mHandlerCheckers.add( 497 withDefaultTimeout( 498 new HandlerChecker(DisplayThread.getHandler(), "display thread", mLock))); 499 // And the animation thread. 500 mHandlerCheckers.add( 501 withDefaultTimeout( 502 new HandlerChecker( 503 AnimationThread.getHandler(), "animation thread", mLock))); 504 // And the surface animation thread. 505 mHandlerCheckers.add( 506 withDefaultTimeout( 507 new HandlerChecker( 508 SurfaceAnimationThread.getHandler(), 509 "surface animation thread", 510 mLock))); 511 // Initialize monitor for Binder threads. 512 addMonitor(new BinderThreadMonitor()); 513 514 mInterestingJavaPids.add(Process.myPid()); 515 516 // See the notes on DEFAULT_TIMEOUT. 517 assert DB || Build.IS_USERDEBUG || 518 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; 519 520 mTraceErrorLogger = new TraceErrorLogger(); 521 } 522 523 /** 524 * Called by SystemServer to cause the internal thread to begin execution. 525 */ start()526 public void start() { 527 mThread.start(); 528 } 529 530 /** 531 * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot. 532 * Should be called during boot after the ActivityManagerService is up and registered 533 * as a system service so it can handle registration of a {@link BroadcastReceiver}. 534 */ init(Context context, ActivityManagerService activity)535 public void init(Context context, ActivityManagerService activity) { 536 mActivity = activity; 537 context.registerReceiver(new RebootRequestReceiver(), 538 new IntentFilter(Intent.ACTION_REBOOT), 539 android.Manifest.permission.REBOOT, null); 540 } 541 542 private static class SettingsObserver extends ContentObserver { 543 private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS); 544 private final Context mContext; 545 private final Watchdog mWatchdog; 546 SettingsObserver(Context context, Watchdog watchdog)547 SettingsObserver(Context context, Watchdog watchdog) { 548 super(BackgroundThread.getHandler()); 549 mContext = context; 550 mWatchdog = watchdog; 551 // Always kick once to ensure that we match current state 552 onChange(); 553 } 554 555 @Override onChange(boolean selfChange, Uri uri, int userId)556 public void onChange(boolean selfChange, Uri uri, int userId) { 557 if (mUri.equals(uri)) { 558 onChange(); 559 } 560 } 561 onChange()562 public void onChange() { 563 try { 564 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong( 565 mContext.getContentResolver(), 566 Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT)); 567 } catch (RuntimeException e) { 568 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e); 569 } 570 } 571 } 572 573 /** 574 * Register an observer to listen to settings. 575 * 576 * It needs to be called after the settings service is initialized. 577 */ registerSettingsObserver(Context context)578 public void registerSettingsObserver(Context context) { 579 context.getContentResolver().registerContentObserver( 580 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS), 581 false, 582 new SettingsObserver(context, this), 583 UserHandle.USER_SYSTEM); 584 } 585 586 /** 587 * Updates watchdog timeout values. 588 */ updateWatchdogTimeout(long timeoutMillis)589 void updateWatchdogTimeout(long timeoutMillis) { 590 // See the notes on DEFAULT_TIMEOUT. 591 if (!DB && !Build.IS_USERDEBUG 592 && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) { 593 timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1; 594 } 595 mWatchdogTimeoutMillis = timeoutMillis; 596 Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis"); 597 } 598 isInterestingJavaProcess(String processName)599 private static boolean isInterestingJavaProcess(String processName) { 600 return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName) 601 || processName.equals("com.android.phone"); 602 } 603 604 /** 605 * Notifies the watchdog when a Java process with {@code pid} is started. 606 * This process may have its stack trace dumped during an ANR. 607 */ processStarted(String processName, int pid)608 public void processStarted(String processName, int pid) { 609 if (isInterestingJavaProcess(processName)) { 610 Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid); 611 synchronized (mLock) { 612 mInterestingJavaPids.add(pid); 613 } 614 } 615 } 616 617 /** 618 * Notifies the watchdog when a Java process with {@code pid} dies. 619 */ processDied(String processName, int pid)620 public void processDied(String processName, int pid) { 621 if (isInterestingJavaProcess(processName)) { 622 Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid); 623 synchronized (mLock) { 624 mInterestingJavaPids.remove(Integer.valueOf(pid)); 625 } 626 } 627 } 628 setActivityController(IActivityController controller)629 public void setActivityController(IActivityController controller) { 630 synchronized (mLock) { 631 mController = controller; 632 } 633 } 634 setAllowRestart(boolean allowRestart)635 public void setAllowRestart(boolean allowRestart) { 636 synchronized (mLock) { 637 mAllowRestart = allowRestart; 638 } 639 } 640 addMonitor(Monitor monitor)641 public void addMonitor(Monitor monitor) { 642 synchronized (mLock) { 643 mMonitorChecker.addMonitorLocked(monitor); 644 } 645 } 646 addThread(Handler thread)647 public void addThread(Handler thread) { 648 synchronized (mLock) { 649 final String name = thread.getLooper().getThread().getName(); 650 mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name, mLock))); 651 } 652 } 653 addThread(Handler thread, long timeoutMillis)654 public void addThread(Handler thread, long timeoutMillis) { 655 synchronized (mLock) { 656 final String name = thread.getLooper().getThread().getName(); 657 mHandlerCheckers.add( 658 withCustomTimeout(new HandlerChecker(thread, name, mLock), timeoutMillis)); 659 } 660 } 661 662 /** 663 * Pauses the checks of the watchdog for this thread. This is useful 664 * to run a slow operation on one of the monitored thread. 665 * 666 * <p>After the given time, the timeout will go back to the default value. 667 * <p>This method does not require resume to be called. 668 */ pauseWatchingCurrentThreadFor(int pauseMillis, String reason)669 public void pauseWatchingCurrentThreadFor(int pauseMillis, String reason) { 670 synchronized (mLock) { 671 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 672 HandlerChecker checker = hc.checker(); 673 if (Thread.currentThread().equals(checker.getThread())) { 674 checker.pauseForLocked(pauseMillis, reason); 675 } 676 } 677 } 678 } 679 680 /** 681 * Pauses the checks of the watchdog for the monitor thread for the given time 682 * 683 * <p>Similar to {@link pauseWatchingCurrentThreadFor} but used for monitors added 684 * through {@link #addMonitor} 685 * <p>This method does not require resume to be called. 686 */ pauseWatchingMonitorsFor(int pauseMillis, String reason)687 public void pauseWatchingMonitorsFor(int pauseMillis, String reason) { 688 mMonitorChecker.pauseForLocked(pauseMillis, reason); 689 } 690 691 /** 692 * Pauses Watchdog action for the currently running thread. Useful before executing long running 693 * operations that could falsely trigger the watchdog. Each call to this will require a matching 694 * call to {@link #resumeWatchingCurrentThread}. 695 * 696 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 697 * 698 * <p>If the Watchdog is already paused for the current thread, this call adds 699 * adds another pause and will require an additional {@link #resumeCurrentThread} to resume. 700 * 701 * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all 702 * pauses have been resumed. Prefer to use #pauseWatchingCurrentThreadFor. 703 */ pauseWatchingCurrentThread(String reason)704 public void pauseWatchingCurrentThread(String reason) { 705 synchronized (mLock) { 706 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 707 HandlerChecker checker = hc.checker(); 708 if (Thread.currentThread().equals(checker.getThread())) { 709 checker.pauseLocked(reason); 710 } 711 } 712 } 713 } 714 715 /** 716 * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running 717 * thread. 718 * 719 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 720 * 721 * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf. 722 * 723 * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise, 724 * the Watchdog action for the current thread remains paused until resume is called at least 725 * as many times as the calls to pause. 726 */ resumeWatchingCurrentThread(String reason)727 public void resumeWatchingCurrentThread(String reason) { 728 synchronized (mLock) { 729 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 730 HandlerChecker checker = hc.checker(); 731 if (Thread.currentThread().equals(checker.getThread())) { 732 checker.resumeLocked(reason); 733 } 734 } 735 } 736 } 737 738 /** 739 * Perform a full reboot of the system. 740 */ rebootSystem(String reason)741 void rebootSystem(String reason) { 742 Slog.i(TAG, "Rebooting system because: " + reason); 743 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 744 try { 745 pms.reboot(false, reason, false); 746 } catch (RemoteException ex) { 747 } 748 } 749 evaluateCheckerCompletionLocked()750 private int evaluateCheckerCompletionLocked() { 751 int state = COMPLETED; 752 for (int i=0; i<mHandlerCheckers.size(); i++) { 753 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 754 state = Math.max(state, hc.getCompletionStateLocked()); 755 } 756 return state; 757 } 758 getCheckersWithStateLocked(int completionState)759 private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) { 760 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 761 for (int i=0; i<mHandlerCheckers.size(); i++) { 762 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 763 if (hc.getCompletionStateLocked() == completionState) { 764 checkers.add(hc); 765 } 766 } 767 return checkers; 768 } 769 describeCheckersLocked(List<HandlerChecker> checkers)770 private String describeCheckersLocked(List<HandlerChecker> checkers) { 771 StringBuilder builder = new StringBuilder(128); 772 for (int i=0; i<checkers.size(); i++) { 773 if (builder.length() > 0) { 774 builder.append(", "); 775 } 776 builder.append(checkers.get(i).describeBlockedStateLocked()); 777 } 778 return builder.toString(); 779 } 780 addInterestingHidlPids(HashSet<Integer> pids)781 private static void addInterestingHidlPids(HashSet<Integer> pids) { 782 try { 783 IServiceManager serviceManager = IServiceManager.getService(); 784 ArrayList<IServiceManager.InstanceDebugInfo> dump = 785 serviceManager.debugDump(); 786 for (IServiceManager.InstanceDebugInfo info : dump) { 787 if (info.pid == IServiceManager.PidConstant.NO_PID) { 788 continue; 789 } 790 791 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 792 continue; 793 } 794 795 pids.add(info.pid); 796 } 797 } catch (RemoteException e) { 798 Log.w(TAG, e); 799 } 800 } 801 addInterestingAidlPids(HashSet<Integer> pids)802 private static void addInterestingAidlPids(HashSet<Integer> pids) { 803 ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo(); 804 if (infos == null) return; 805 806 for (ServiceDebugInfo info : infos) { 807 for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) { 808 if (info.name.startsWith(prefix)) { 809 pids.add(info.debugPid); 810 } 811 } 812 } 813 } 814 getInterestingNativePids()815 static ArrayList<Integer> getInterestingNativePids() { 816 HashSet<Integer> pids = new HashSet<>(); 817 addInterestingAidlPids(pids); 818 addInterestingHidlPids(pids); 819 820 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 821 if (nativePids != null) { 822 for (int i : nativePids) { 823 pids.add(i); 824 } 825 } 826 827 return new ArrayList<Integer>(pids); 828 } 829 run()830 private void run() { 831 boolean waitedHalf = false; 832 833 while (true) { 834 List<HandlerChecker> blockedCheckers = Collections.emptyList(); 835 String subject = ""; 836 boolean allowRestart = true; 837 int debuggerWasConnected = 0; 838 boolean doWaitedPreDump = false; 839 // The value of mWatchdogTimeoutMillis might change while we are executing the loop. 840 // We store the current value to use a consistent value for all handlers. 841 final long watchdogTimeoutMillis = mWatchdogTimeoutMillis; 842 final long checkIntervalMillis = watchdogTimeoutMillis / PRE_WATCHDOG_TIMEOUT_RATIO; 843 final ArrayList<Integer> pids; 844 synchronized (mLock) { 845 long timeout = checkIntervalMillis; 846 // Make sure we (re)spin the checkers that have become idle within 847 // this wait-and-check interval 848 for (int i=0; i<mHandlerCheckers.size(); i++) { 849 HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i); 850 // We pick the watchdog to apply every time we reschedule the checkers. The 851 // default timeout might have changed since the last run. 852 hc.checker().scheduleCheckLocked(hc.customTimeoutMillis() 853 .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER)); 854 } 855 856 if (debuggerWasConnected > 0) { 857 debuggerWasConnected--; 858 } 859 860 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 861 // wait while asleep. If the device is asleep then the thing that we are waiting 862 // to timeout on is asleep as well and won't have a chance to run, causing a false 863 // positive on when to kill things. 864 long start = SystemClock.uptimeMillis(); 865 while (timeout > 0) { 866 if (Debug.isDebuggerConnected()) { 867 debuggerWasConnected = 2; 868 } 869 try { 870 mLock.wait(timeout); 871 // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting 872 } catch (InterruptedException e) { 873 Log.wtf(TAG, e); 874 } 875 if (Debug.isDebuggerConnected()) { 876 debuggerWasConnected = 2; 877 } 878 timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start); 879 } 880 881 final int waitState = evaluateCheckerCompletionLocked(); 882 if (waitState == COMPLETED) { 883 // The monitors have returned; reset 884 waitedHalf = false; 885 continue; 886 } else if (waitState == WAITING) { 887 // still waiting but within their configured intervals; back off and recheck 888 continue; 889 } else if (waitState == WAITED_UNTIL_PRE_WATCHDOG) { 890 if (!waitedHalf) { 891 Slog.i(TAG, "WAITED_UNTIL_PRE_WATCHDOG"); 892 waitedHalf = true; 893 // We've waited until the pre-watchdog, but we'd need to do the stack trace 894 // dump w/o the lock. 895 blockedCheckers = getCheckersWithStateLocked(WAITED_UNTIL_PRE_WATCHDOG); 896 subject = describeCheckersLocked(blockedCheckers); 897 pids = new ArrayList<>(mInterestingJavaPids); 898 doWaitedPreDump = true; 899 } else { 900 continue; 901 } 902 } else { 903 // something is overdue! 904 blockedCheckers = getCheckersWithStateLocked(OVERDUE); 905 subject = describeCheckersLocked(blockedCheckers); 906 allowRestart = mAllowRestart; 907 pids = new ArrayList<>(mInterestingJavaPids); 908 } 909 } // END synchronized (mLock) 910 911 // If we got here, that means that the system is most likely hung. 912 // 913 // First collect stack traces from all threads of the system process. 914 // 915 // Then, if we reached the full timeout, kill this process so that the system will 916 // restart. If we reached pre-watchdog timeout, just log some information and continue. 917 logWatchog(doWaitedPreDump, subject, pids); 918 919 if (doWaitedPreDump) { 920 // We have waited for only pre-watchdog timeout, we continue to wait for the 921 // duration of the full timeout before killing the process. 922 continue; 923 } 924 925 IActivityController controller; 926 synchronized (mLock) { 927 controller = mController; 928 } 929 if (controller != null) { 930 Slog.i(TAG, "Reporting stuck state to activity controller"); 931 try { 932 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 933 // 1 = keep waiting, -1 = kill system 934 int res = controller.systemNotResponding(subject); 935 if (res >= 0) { 936 Slog.i(TAG, "Activity controller requested to coninue to wait"); 937 waitedHalf = false; 938 continue; 939 } 940 } catch (RemoteException e) { 941 } 942 } 943 944 // Only kill the process if the debugger is not attached. 945 if (Debug.isDebuggerConnected()) { 946 debuggerWasConnected = 2; 947 } 948 if (debuggerWasConnected >= 2) { 949 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 950 } else if (debuggerWasConnected > 0) { 951 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 952 } else if (!allowRestart) { 953 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 954 } else { 955 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 956 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); 957 Slog.w(TAG, "*** GOODBYE!"); 958 if (!Build.IS_USER && isCrashLoopFound() 959 && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) { 960 breakCrashLoop(); 961 } 962 Process.killProcess(Process.myPid()); 963 System.exit(10); 964 } 965 966 waitedHalf = false; 967 } 968 } 969 logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids)970 private void logWatchog(boolean preWatchdog, String subject, ArrayList<Integer> pids) { 971 // Get critical event log before logging the pre-watchdog so that it doesn't 972 // occur in the log. 973 String criticalEvents = 974 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile(); 975 final UUID errorId = mTraceErrorLogger.generateErrorId(); 976 if (mTraceErrorLogger.isAddErrorIdEnabled()) { 977 mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(), 978 errorId); 979 mTraceErrorLogger.addSubjectToTrace(subject, errorId); 980 } 981 982 final String dropboxTag; 983 if (preWatchdog) { 984 dropboxTag = "pre_watchdog"; 985 CriticalEventLog.getInstance().logHalfWatchdog(subject); 986 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED); 987 } else { 988 dropboxTag = "watchdog"; 989 CriticalEventLog.getInstance().logWatchdog(subject, errorId); 990 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 991 // Log the atom as early as possible since it is used as a mechanism to trigger 992 // Perfetto. Ideally, the Perfetto trace capture should happen as close to the 993 // point in time when the Watchdog happens as possible. 994 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject); 995 } 996 997 final LinkedHashMap headersMap = 998 com.android.server.am.Flags.enableDropboxWatchdogHeaders() 999 ? new LinkedHashMap<>(Collections.singletonMap("Watchdog-Type", dropboxTag)) : null; 1000 long anrTime = SystemClock.uptimeMillis(); 1001 StringBuilder report = new StringBuilder(); 1002 report.append(ResourcePressureUtil.currentPsiState()); 1003 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false); 1004 StringWriter tracesFileException = new StringWriter(); 1005 final File stack = StackTracesDumpHelper.dumpStackTraces( 1006 pids, processCpuTracker, new SparseBooleanArray(), 1007 CompletableFuture.completedFuture(getInterestingNativePids()), 1008 tracesFileException, subject, criticalEvents, headersMap, 1009 Runnable::run, /* latencyTracker= */null); 1010 // Give some extra time to make sure the stack traces get written. 1011 // The system's been hanging for a whlie, another second or two won't hurt much. 1012 SystemClock.sleep(5000); 1013 processCpuTracker.update(); 1014 report.append(processCpuTracker.printCurrentState(anrTime, 10)); 1015 report.append(tracesFileException.getBuffer()); 1016 1017 if (!preWatchdog) { 1018 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the 1019 // kernel log 1020 doSysRq('w'); 1021 doSysRq('m'); 1022 doSysRq('l'); 1023 } 1024 1025 // Try to add the error to the dropbox, but assuming that the ActivityManager 1026 // itself may be deadlocked. (which has happened, causing this statement to 1027 // deadlock and the watchdog as a whole to be ineffective) 1028 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 1029 public void run() { 1030 // If a watched thread hangs before init() is called, we don't have a 1031 // valid mActivity. So we can't log the error to dropbox. 1032 if (mActivity != null) { 1033 mActivity.addErrorToDropBox( 1034 dropboxTag, null, "system_server", null, null, null, 1035 null, report.toString(), stack, null, null, null, 1036 errorId, null); 1037 } 1038 } 1039 }; 1040 dropboxThread.start(); 1041 try { 1042 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 1043 } catch (InterruptedException ignored) { } 1044 } 1045 doSysRq(char c)1046 private void doSysRq(char c) { 1047 try { 1048 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 1049 sysrq_trigger.write(c); 1050 sysrq_trigger.close(); 1051 } catch (IOException e) { 1052 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 1053 } 1054 } 1055 resetTimeoutHistory()1056 private void resetTimeoutHistory() { 1057 writeTimeoutHistory(new ArrayList<String>()); 1058 } 1059 writeTimeoutHistory(Iterable<String> crashHistory)1060 private void writeTimeoutHistory(Iterable<String> crashHistory) { 1061 String data = String.join(",", crashHistory); 1062 1063 try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) { 1064 writer.write(SystemProperties.get("ro.boottime.zygote")); 1065 writer.write(":"); 1066 writer.write(data); 1067 } catch (IOException e) { 1068 Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e); 1069 } 1070 } 1071 readTimeoutHistory()1072 private String[] readTimeoutHistory() { 1073 final String[] emptyStringArray = {}; 1074 1075 try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) { 1076 String line = reader.readLine(); 1077 if (line == null) { 1078 return emptyStringArray; 1079 } 1080 1081 String[] data = line.trim().split(":"); 1082 String boottime = data.length >= 1 ? data[0] : ""; 1083 String history = data.length >= 2 ? data[1] : ""; 1084 if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) { 1085 return history.split(","); 1086 } else { 1087 return emptyStringArray; 1088 } 1089 } catch (FileNotFoundException e) { 1090 return emptyStringArray; 1091 } catch (IOException e) { 1092 Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e); 1093 return emptyStringArray; 1094 } 1095 } 1096 hasActiveUsbConnection()1097 private boolean hasActiveUsbConnection() { 1098 try { 1099 final String state = FileUtils.readTextFile( 1100 new File("/sys/class/android_usb/android0/state"), 1101 128 /*max*/, null /*ellipsis*/).trim(); 1102 if ("CONFIGURED".equals(state)) { 1103 return true; 1104 } 1105 } catch (IOException e) { 1106 Slog.w(TAG, "Failed to determine if device was on USB", e); 1107 } 1108 return false; 1109 } 1110 isCrashLoopFound()1111 private boolean isCrashLoopFound() { 1112 int fatalCount = WatchdogProperties.fatal_count().orElse(0); 1113 long fatalWindowMs = TimeUnit.SECONDS.toMillis( 1114 WatchdogProperties.fatal_window_seconds().orElse(0)); 1115 if (fatalCount == 0 || fatalWindowMs == 0) { 1116 if (fatalCount != fatalWindowMs) { 1117 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together", 1118 PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS)); 1119 } 1120 return false; 1121 } 1122 1123 // new-history = [last (fatalCount - 1) items in old-history] + [nowMs]. 1124 long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep. 1125 String[] rawCrashHistory = readTimeoutHistory(); 1126 ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange( 1127 rawCrashHistory, 1128 Math.max(0, rawCrashHistory.length - fatalCount - 1), 1129 rawCrashHistory.length))); 1130 // Something wrong here. 1131 crashHistory.add(String.valueOf(nowMs)); 1132 writeTimeoutHistory(crashHistory); 1133 1134 // Returns false if the device has an active USB connection. 1135 if (hasActiveUsbConnection()) { 1136 return false; 1137 } 1138 1139 long firstCrashMs; 1140 try { 1141 firstCrashMs = Long.parseLong(crashHistory.get(0)); 1142 } catch (NumberFormatException t) { 1143 Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t); 1144 resetTimeoutHistory(); 1145 return false; 1146 } 1147 return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs; 1148 } 1149 breakCrashLoop()1150 private void breakCrashLoop() { 1151 try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) { 1152 kmsg.append("Fatal reset to escape the system_server crashing loop\n"); 1153 } catch (IOException e) { 1154 Slog.w(TAG, "Failed to append to kmsg", e); 1155 } 1156 doSysRq('c'); 1157 } 1158 1159 @Override dump(@onNull PrintWriter pw, @Nullable String[] args)1160 public void dump(@NonNull PrintWriter pw, @Nullable String[] args) { 1161 pw.print("WatchdogTimeoutMillis="); 1162 pw.println(mWatchdogTimeoutMillis); 1163 } 1164 } 1165