• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static android.content.Intent.ACTION_REBOOT;
20 import static android.content.Intent.ACTION_SHUTDOWN;
21 import static android.service.watchdog.ExplicitHealthCheckService.PackageConfig;
22 import static android.util.Xml.Encoding.UTF_8;
23 
24 import static com.android.server.crashrecovery.CrashRecoveryUtils.dumpCrashRecoveryEvents;
25 
26 import static java.lang.annotation.RetentionPolicy.SOURCE;
27 
28 import android.annotation.CallbackExecutor;
29 import android.annotation.FlaggedApi;
30 import android.annotation.IntDef;
31 import android.annotation.NonNull;
32 import android.annotation.Nullable;
33 import android.annotation.SuppressLint;
34 import android.annotation.SystemApi;
35 import android.content.BroadcastReceiver;
36 import android.content.Context;
37 import android.content.Intent;
38 import android.content.IntentFilter;
39 import android.content.pm.PackageInfo;
40 import android.content.pm.PackageManager;
41 import android.content.pm.VersionedPackage;
42 import android.crashrecovery.flags.Flags;
43 import android.os.Environment;
44 import android.os.Handler;
45 import android.os.Looper;
46 import android.os.Process;
47 import android.os.SystemProperties;
48 import android.provider.DeviceConfig;
49 import android.sysprop.CrashRecoveryProperties;
50 import android.text.TextUtils;
51 import android.util.ArrayMap;
52 import android.util.ArraySet;
53 import android.util.AtomicFile;
54 import android.util.EventLog;
55 import android.util.IndentingPrintWriter;
56 import android.util.LongArrayQueue;
57 import android.util.Slog;
58 import android.util.Xml;
59 import android.util.XmlUtils;
60 
61 import com.android.internal.annotations.GuardedBy;
62 import com.android.internal.annotations.VisibleForTesting;
63 import com.android.internal.util.FastXmlSerializer;
64 import com.android.modules.utils.BackgroundThread;
65 
66 import libcore.io.IoUtils;
67 
68 import org.xmlpull.v1.XmlPullParser;
69 import org.xmlpull.v1.XmlPullParserException;
70 import org.xmlpull.v1.XmlSerializer;
71 
72 import java.io.BufferedReader;
73 import java.io.BufferedWriter;
74 import java.io.File;
75 import java.io.FileInputStream;
76 import java.io.FileNotFoundException;
77 import java.io.FileOutputStream;
78 import java.io.FileReader;
79 import java.io.FileWriter;
80 import java.io.IOException;
81 import java.io.InputStream;
82 import java.io.ObjectInputStream;
83 import java.io.ObjectOutputStream;
84 import java.io.PrintWriter;
85 import java.lang.annotation.Retention;
86 import java.lang.annotation.RetentionPolicy;
87 import java.util.ArrayList;
88 import java.util.Collections;
89 import java.util.HashMap;
90 import java.util.Iterator;
91 import java.util.List;
92 import java.util.Map;
93 import java.util.NoSuchElementException;
94 import java.util.Set;
95 import java.util.concurrent.Executor;
96 import java.util.concurrent.TimeUnit;
97 
98 /**
99  * Monitors the health of packages on the system and notifies interested observers when packages
100  * fail. On failure, the registered observer with the least user impacting mitigation will
101  * be notified.
102  * @hide
103  */
104 @FlaggedApi(Flags.FLAG_ENABLE_CRASHRECOVERY)
105 @SystemApi(client = SystemApi.Client.SYSTEM_SERVER)
106 public class PackageWatchdog {
107     private static final String TAG = "PackageWatchdog";
108 
109     static final String PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS =
110             "watchdog_trigger_failure_duration_millis";
111     static final String PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT =
112             "watchdog_trigger_failure_count";
113     static final String PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED =
114             "watchdog_explicit_health_check_enabled";
115 
116     // TODO: make the following values configurable via DeviceConfig
117     private static final long NATIVE_CRASH_POLLING_INTERVAL_MILLIS =
118             TimeUnit.SECONDS.toMillis(30);
119     private static final long NUMBER_OF_NATIVE_CRASH_POLLS = 10;
120 
121 
122     /** Reason for package failure could not be determined. */
123     public static final int FAILURE_REASON_UNKNOWN = 0;
124 
125     /** The package had a native crash. */
126     public static final int FAILURE_REASON_NATIVE_CRASH = 1;
127 
128     /** The package failed an explicit health check. */
129     public static final int FAILURE_REASON_EXPLICIT_HEALTH_CHECK = 2;
130 
131     /** The app crashed. */
132     public static final int FAILURE_REASON_APP_CRASH = 3;
133 
134     /** The app was not responding. */
135     public static final int FAILURE_REASON_APP_NOT_RESPONDING = 4;
136 
137     /** The device was boot looping. */
138     public static final int FAILURE_REASON_BOOT_LOOP = 5;
139 
140     /** @hide */
141     @IntDef(prefix = { "FAILURE_REASON_" }, value = {
142             FAILURE_REASON_UNKNOWN,
143             FAILURE_REASON_NATIVE_CRASH,
144             FAILURE_REASON_EXPLICIT_HEALTH_CHECK,
145             FAILURE_REASON_APP_CRASH,
146             FAILURE_REASON_APP_NOT_RESPONDING,
147             FAILURE_REASON_BOOT_LOOP
148     })
149     @Retention(RetentionPolicy.SOURCE)
150     public @interface FailureReasons {}
151 
152     // Duration to count package failures before it resets to 0
153     @VisibleForTesting
154     static final int DEFAULT_TRIGGER_FAILURE_DURATION_MS =
155             (int) TimeUnit.MINUTES.toMillis(1);
156     // Number of package failures within the duration above before we notify observers
157     @VisibleForTesting
158     static final int DEFAULT_TRIGGER_FAILURE_COUNT = 5;
159     @VisibleForTesting
160     static final long DEFAULT_OBSERVING_DURATION_MS = TimeUnit.DAYS.toMillis(2);
161     // Sliding window for tracking how many mitigation calls were made for a package.
162     @VisibleForTesting
163     static final long DEFAULT_DEESCALATION_WINDOW_MS = TimeUnit.HOURS.toMillis(1);
164     // Whether explicit health checks are enabled or not
165     private static final boolean DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED = true;
166 
167     @VisibleForTesting
168     static final int DEFAULT_BOOT_LOOP_TRIGGER_COUNT = 5;
169 
170     static final long DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS = TimeUnit.MINUTES.toMillis(10);
171 
172     // Time needed to apply mitigation
173     private static final String MITIGATION_WINDOW_MS =
174             "persist.device_config.configuration.mitigation_window_ms";
175     @VisibleForTesting
176     static final long DEFAULT_MITIGATION_WINDOW_MS = TimeUnit.SECONDS.toMillis(5);
177 
178     // Threshold level at which or above user might experience significant disruption.
179     private static final String MAJOR_USER_IMPACT_LEVEL_THRESHOLD =
180             "persist.device_config.configuration.major_user_impact_level_threshold";
181     private static final int DEFAULT_MAJOR_USER_IMPACT_LEVEL_THRESHOLD =
182             PackageHealthObserverImpact.USER_IMPACT_LEVEL_71;
183 
184     // Comma separated list of all packages exempt from user impact level threshold. If a package
185     // in the list is crash looping, all the mitigations including factory reset will be performed.
186     private static final String PACKAGES_EXEMPT_FROM_IMPACT_LEVEL_THRESHOLD =
187             "persist.device_config.configuration.packages_exempt_from_impact_level_threshold";
188 
189     // Comma separated list of default packages exempt from user impact level threshold.
190     private static final String DEFAULT_PACKAGES_EXEMPT_FROM_IMPACT_LEVEL_THRESHOLD =
191             "com.android.systemui";
192 
193     private long mNumberOfNativeCrashPollsRemaining;
194 
195     private static final int DB_VERSION = 1;
196     private static final String TAG_PACKAGE_WATCHDOG = "package-watchdog";
197     private static final String TAG_PACKAGE = "package";
198     private static final String TAG_OBSERVER = "observer";
199     private static final String ATTR_VERSION = "version";
200     private static final String ATTR_NAME = "name";
201     private static final String ATTR_DURATION = "duration";
202     private static final String ATTR_EXPLICIT_HEALTH_CHECK_DURATION = "health-check-duration";
203     private static final String ATTR_PASSED_HEALTH_CHECK = "passed-health-check";
204     private static final String ATTR_MITIGATION_CALLS = "mitigation-calls";
205     private static final String ATTR_MITIGATION_COUNT = "mitigation-count";
206 
207     // A file containing information about the current mitigation count in the case of a boot loop.
208     // This allows boot loop information to persist in the case of an fs-checkpoint being
209     // aborted.
210     private static final String METADATA_FILE = "/metadata/watchdog/mitigation_count.txt";
211 
212     /**
213      * EventLog tags used when logging into the event log. Note the values must be sync with
214      * frameworks/base/services/core/java/com/android/server/EventLogTags.logtags to get correct
215      * name translation.
216      */
217     private static final int LOG_TAG_RESCUE_NOTE = 2900;
218 
219     private static final Object sPackageWatchdogLock = new Object();
220     @GuardedBy("sPackageWatchdogLock")
221     private static PackageWatchdog sPackageWatchdog;
222 
223     private static final Object sLock = new Object();
224     // System server context
225     private final Context mContext;
226     // Handler to run short running tasks
227     private final Handler mShortTaskHandler;
228     // Handler for processing IO and long running tasks
229     private final Handler mLongTaskHandler;
230     // Contains (observer-name -> observer-handle) that have ever been registered from
231     // previous boots. Observers with all packages expired are periodically pruned.
    // It is saved to disk on system shutdown and repopulated on startup so it survives reboots.
233     @GuardedBy("sLock")
234     private final ArrayMap<String, ObserverInternal> mAllObservers = new ArrayMap<>();
235     // File containing the XML data of monitored packages /data/system/package-watchdog.xml
236     private final AtomicFile mPolicyFile;
237     private final ExplicitHealthCheckController mHealthCheckController;
238     private final Runnable mSyncRequests = this::syncRequests;
239     private final Runnable mSyncStateWithScheduledReason = this::syncStateWithScheduledReason;
240     private final Runnable mSaveToFile = this::saveToFile;
241     private final SystemClock mSystemClock;
242     private final BootThreshold mBootThreshold;
243     private final DeviceConfig.OnPropertiesChangedListener
244             mOnPropertyChangedListener = this::onPropertyChanged;
245 
246     private final Set<String> mPackagesExemptFromImpactLevelThreshold = new ArraySet<>();
247 
248     // The set of packages that have been synced with the ExplicitHealthCheckController
249     @GuardedBy("sLock")
250     private Set<String> mRequestedHealthCheckPackages = new ArraySet<>();
251     @GuardedBy("sLock")
252     private boolean mIsPackagesReady;
253     // Flag to control whether explicit health checks are supported or not
254     @GuardedBy("sLock")
255     private boolean mIsHealthCheckEnabled = DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED;
256     @GuardedBy("sLock")
257     private int mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
258     @GuardedBy("sLock")
259     private int mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
260     // SystemClock#uptimeMillis when we last executed #syncState
261     // 0 if no prune is scheduled.
262     @GuardedBy("sLock")
263     private long mUptimeAtLastStateSync;
264     // If true, sync explicit health check packages with the ExplicitHealthCheckController.
265     @GuardedBy("sLock")
266     private boolean mSyncRequired = false;
267 
268     @GuardedBy("sLock")
269     private long mLastMitigation = -1000000;
270 
271     @FunctionalInterface
272     @VisibleForTesting
273     interface SystemClock {
uptimeMillis()274         long uptimeMillis();
275     }
276 
PackageWatchdog(Context context)277     private PackageWatchdog(Context context) {
278         // Needs to be constructed inline
279         this(context, new AtomicFile(
280                         new File(new File(Environment.getDataDirectory(), "system"),
281                                 "package-watchdog.xml")),
282                 new Handler(Looper.myLooper()), BackgroundThread.getHandler(),
283                 new ExplicitHealthCheckController(context),
284                 android.os.SystemClock::uptimeMillis);
285     }
286 
287     /**
288      * Creates a PackageWatchdog that allows injecting dependencies.
289      */
290     @VisibleForTesting
PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler, Handler longTaskHandler, ExplicitHealthCheckController controller, SystemClock clock)291     PackageWatchdog(Context context, AtomicFile policyFile, Handler shortTaskHandler,
292             Handler longTaskHandler, ExplicitHealthCheckController controller,
293             SystemClock clock) {
294         mContext = context;
295         mPolicyFile = policyFile;
296         mShortTaskHandler = shortTaskHandler;
297         mLongTaskHandler = longTaskHandler;
298         mHealthCheckController = controller;
299         mSystemClock = clock;
300         mNumberOfNativeCrashPollsRemaining = NUMBER_OF_NATIVE_CRASH_POLLS;
301         mBootThreshold = new BootThreshold(DEFAULT_BOOT_LOOP_TRIGGER_COUNT,
302                 DEFAULT_BOOT_LOOP_TRIGGER_WINDOW_MS);
303 
304         loadFromFile();
305         sPackageWatchdog = this;
306     }
307 
308     /**
309      * Creates or gets singleton instance of PackageWatchdog.
310      *
311      * @param context The system server context.
312      */
getInstance(@onNull Context context)313     public static  @NonNull PackageWatchdog getInstance(@NonNull Context context) {
314         synchronized (sPackageWatchdogLock) {
315             if (sPackageWatchdog == null) {
316                 new PackageWatchdog(context);
317             }
318             return sPackageWatchdog;
319         }
320     }
321 
322     /**
323      * Called during boot to notify when packages are ready on the device so we can start
324      * binding.
325      * @hide
326      */
onPackagesReady()327     public void onPackagesReady() {
328         synchronized (sLock) {
329             mIsPackagesReady = true;
330             mHealthCheckController.setCallbacks(packageName -> onHealthCheckPassed(packageName),
331                     packages -> onSupportedPackages(packages),
332                     this::onSyncRequestNotified);
333             setPropertyChangedListenerLocked();
334             updateConfigs();
335         }
336     }
337 
338     /**
339      * Registers {@code observer} to listen for package failures. Add a new ObserverInternal for
340      * this observer if it does not already exist.
341      * For executing mitigations observers will receive callback on the given executor.
342      *
343      * <p>Observers are expected to call this on boot. It does not specify any packages but
344      * it will resume observing any packages requested from a previous boot.
345      *
346      * @param observer instance of {@link PackageHealthObserver} for observing package failures
347      *                 and boot loops.
348      * @param executor Executor for the thread on which observers would receive callbacks
349      */
registerHealthObserver(@onNull @allbackExecutor Executor executor, @NonNull PackageHealthObserver observer)350     public void registerHealthObserver(@NonNull @CallbackExecutor Executor executor,
351             @NonNull PackageHealthObserver observer) {
352         synchronized (sLock) {
353             ObserverInternal internalObserver = mAllObservers.get(observer.getUniqueIdentifier());
354             if (internalObserver != null) {
355                 internalObserver.registeredObserver = observer;
356                 internalObserver.observerExecutor = executor;
357             } else {
358                 internalObserver = new ObserverInternal(observer.getUniqueIdentifier(),
359                         new ArrayList<>());
360                 internalObserver.registeredObserver = observer;
361                 internalObserver.observerExecutor = executor;
362                 mAllObservers.put(observer.getUniqueIdentifier(), internalObserver);
363                 syncState("added new observer");
364             }
365         }
366     }
367 
368     /**
369      * Starts observing the health of the {@code packages} for {@code observer}.
370      * Note: Observer needs to be registered with {@link #registerHealthObserver} before calling
371      * this API.
372      *
373      * <p>If monitoring a package supporting explicit health check, at the end of the monitoring
374      * duration if {@link #onHealthCheckPassed} was never called,
375      * {@link PackageHealthObserver#onExecuteHealthCheckMitigation} will be called as if the
376      * package failed.
377      *
378      * <p>If {@code observer} is already monitoring a package in {@code packageNames},
379      * the monitoring window of that package will be reset to {@code durationMs} and the health
380      * check state will be reset to a default.
381      *
382      * <p>The {@code observer} must be registered with {@link #registerHealthObserver} before
383      * calling this method.
384      *
385      * @param packageNames The list of packages to check. If this is empty, the call will be a
386      *                     no-op.
387      *
388      * @param timeoutMs The timeout after which Explicit Health Checks would not run. If this is
389      *                  less than 1, a default monitoring duration 2 days will be used.
390      *
391      * @throws IllegalStateException if the observer was not previously registered
392      */
startExplicitHealthCheck(@onNull List<String> packageNames, long timeoutMs, @NonNull PackageHealthObserver observer)393     public void startExplicitHealthCheck(@NonNull List<String> packageNames, long timeoutMs,
394             @NonNull PackageHealthObserver observer) {
395         synchronized (sLock) {
396             if (!mAllObservers.containsKey(observer.getUniqueIdentifier())) {
397                 Slog.wtf(TAG, "No observer found, need to register the observer: "
398                         + observer.getUniqueIdentifier());
399                 throw new IllegalStateException("Observer not registered");
400             }
401         }
402         if (packageNames.isEmpty()) {
403             Slog.wtf(TAG, "No packages to observe, " + observer.getUniqueIdentifier());
404             return;
405         }
406         if (timeoutMs < 1) {
407             Slog.wtf(TAG, "Invalid duration " + timeoutMs + "ms for observer "
408                     + observer.getUniqueIdentifier() + ". Not observing packages " + packageNames);
409             timeoutMs = DEFAULT_OBSERVING_DURATION_MS;
410         }
411 
412         List<MonitoredPackage> packages = new ArrayList<>();
413         for (int i = 0; i < packageNames.size(); i++) {
414             // Health checks not available yet so health check state will start INACTIVE
415             MonitoredPackage pkg = newMonitoredPackage(packageNames.get(i), timeoutMs, false);
416             if (pkg != null) {
417                 packages.add(pkg);
418             } else {
419                 Slog.w(TAG, "Failed to create MonitoredPackage for pkg=" + packageNames.get(i));
420             }
421         }
422 
423         if (packages.isEmpty()) {
424             return;
425         }
426 
427         // Sync before we add the new packages to the observers. This will #pruneObservers,
428         // causing any elapsed time to be deducted from all existing packages before we add new
429         // packages. This maintains the invariant that the elapsed time for ALL (new and existing)
430         // packages is the same.
431         mLongTaskHandler.post(() -> {
432             syncState("observing new packages");
433 
434             synchronized (sLock) {
435                 ObserverInternal oldObserver = mAllObservers.get(observer.getUniqueIdentifier());
436                 if (oldObserver == null) {
437                     Slog.d(TAG, observer.getUniqueIdentifier() + " started monitoring health "
438                             + "of packages " + packageNames);
439                     mAllObservers.put(observer.getUniqueIdentifier(),
440                             new ObserverInternal(observer.getUniqueIdentifier(), packages));
441                 } else {
442                     Slog.d(TAG, observer.getUniqueIdentifier() + " added the following "
443                             + "packages to monitor " + packageNames);
444                     oldObserver.updatePackagesLocked(packages);
445                 }
446             }
447 
448             // Sync after we add the new packages to the observers. We may have received packges
449             // requiring an earlier schedule than we are currently scheduled for.
450             syncState("updated observers");
451         });
452 
453     }
454 
455     /**
456      * Unregisters {@code observer} from listening to package failure.
457      * Additionally, this stops observing any packages that may have previously been observed
458      * even from a previous boot.
459      */
unregisterHealthObserver(@onNull PackageHealthObserver observer)460     public void unregisterHealthObserver(@NonNull PackageHealthObserver observer) {
461         mLongTaskHandler.post(() -> {
462             synchronized (sLock) {
463                 mAllObservers.remove(observer.getUniqueIdentifier());
464             }
465             syncState("unregistering observer: " + observer.getUniqueIdentifier());
466         });
467     }
468 
469     /**
470      * Called when a process fails due to a crash, ANR or explicit health check.
471      *
472      * <p>For each package contained in the process, one registered observer with the least user
473      * impact will be notified for mitigation.
474      *
475      * <p>This method could be called frequently if there is a severe problem on the device.
476      */
notifyPackageFailure(@onNull List<VersionedPackage> packages, @FailureReasons int failureReason)477     public void notifyPackageFailure(@NonNull List<VersionedPackage> packages,
478             @FailureReasons int failureReason) {
479         if (packages == null) {
480             Slog.w(TAG, "Could not resolve a list of failing packages");
481             return;
482         }
483         synchronized (sLock) {
484             final long now = mSystemClock.uptimeMillis();
485             if (now >= mLastMitigation
486                     && (now - mLastMitigation) < getMitigationWindowMs()) {
487                 Slog.i(TAG, "Skipping notifyPackageFailure mitigation");
488                 return;
489             }
490         }
491         mLongTaskHandler.post(() -> {
492             synchronized (sLock) {
493                 if (mAllObservers.isEmpty()) {
494                     return;
495                 }
496                 boolean requiresImmediateAction = (failureReason == FAILURE_REASON_NATIVE_CRASH
497                         || failureReason == FAILURE_REASON_EXPLICIT_HEALTH_CHECK);
498                 if (requiresImmediateAction) {
499                     handleFailureImmediately(packages, failureReason);
500                 } else {
501                     for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
502                         VersionedPackage versionedPackage = packages.get(pIndex);
503                         // Observer that will receive failure for versionedPackage
504                         ObserverInternal currentObserverToNotify = null;
505                         int currentObserverImpact = Integer.MAX_VALUE;
506                         MonitoredPackage currentMonitoredPackage = null;
507 
508                         // Find observer with least user impact
509                         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
510                             ObserverInternal observer = mAllObservers.valueAt(oIndex);
511                             PackageHealthObserver registeredObserver = observer.registeredObserver;
512                             if (registeredObserver != null
513                                     && observer.notifyPackageFailureLocked(
514                                     versionedPackage.getPackageName())) {
515                                 MonitoredPackage p = observer.getMonitoredPackage(
516                                         versionedPackage.getPackageName());
517                                 int mitigationCount = 1;
518                                 if (p != null) {
519                                     mitigationCount = p.getMitigationCountLocked() + 1;
520                                 }
521                                 int impact = registeredObserver.onHealthCheckFailed(
522                                         versionedPackage, failureReason, mitigationCount);
523                                 if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
524                                         && impact < currentObserverImpact) {
525                                     currentObserverToNotify = observer;
526                                     currentObserverImpact = impact;
527                                     currentMonitoredPackage = p;
528                                 }
529                             }
530                         }
531 
532                         // Execute action with least user impact
533                         if (currentObserverToNotify != null) {
534                             int mitigationCount;
535                             if (currentMonitoredPackage != null) {
536                                 currentMonitoredPackage.noteMitigationCallLocked();
537                                 mitigationCount =
538                                         currentMonitoredPackage.getMitigationCountLocked();
539                             } else {
540                                 mitigationCount = 1;
541                             }
542                             maybeExecute(currentObserverToNotify, versionedPackage,
543                                     failureReason, currentObserverImpact, mitigationCount);
544                         }
545                     }
546                 }
547             }
548         });
549     }
550 
551     /**
552      * For native crashes or explicit health check failures, call directly into each observer to
553      * mitigate the error without going through failure threshold logic.
554      */
555     @GuardedBy("sLock")
handleFailureImmediately(List<VersionedPackage> packages, @FailureReasons int failureReason)556     private void handleFailureImmediately(List<VersionedPackage> packages,
557             @FailureReasons int failureReason) {
558         VersionedPackage failingPackage = packages.size() > 0 ? packages.get(0) : null;
559         ObserverInternal currentObserverToNotify = null;
560         int currentObserverImpact = Integer.MAX_VALUE;
561         for (ObserverInternal observer: mAllObservers.values()) {
562             PackageHealthObserver registeredObserver = observer.registeredObserver;
563             if (registeredObserver != null) {
564                 int impact = registeredObserver.onHealthCheckFailed(
565                         failingPackage, failureReason, 1);
566                 if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
567                         && impact < currentObserverImpact) {
568                     currentObserverToNotify = observer;
569                     currentObserverImpact = impact;
570                 }
571             }
572         }
573         if (currentObserverToNotify != null) {
574             maybeExecute(currentObserverToNotify, failingPackage, failureReason,
575                     currentObserverImpact, /*mitigationCount=*/ 1);
576         }
577     }
578 
maybeExecute(ObserverInternal currentObserverToNotify, VersionedPackage versionedPackage, @FailureReasons int failureReason, int currentObserverImpact, int mitigationCount)579     private void maybeExecute(ObserverInternal currentObserverToNotify,
580                               VersionedPackage versionedPackage,
581                               @FailureReasons int failureReason,
582                               int currentObserverImpact,
583                               int mitigationCount) {
584         if (allowMitigations(currentObserverImpact, versionedPackage)) {
585             PackageHealthObserver registeredObserver;
586             synchronized (sLock) {
587                 mLastMitigation = mSystemClock.uptimeMillis();
588                 registeredObserver = currentObserverToNotify.registeredObserver;
589             }
590             currentObserverToNotify.observerExecutor.execute(() ->
591                     registeredObserver.onExecuteHealthCheckMitigation(versionedPackage,
592                             failureReason, mitigationCount));
593         }
594     }
595 
allowMitigations(int currentObserverImpact, VersionedPackage versionedPackage)596     private boolean allowMitigations(int currentObserverImpact,
597             VersionedPackage versionedPackage) {
598         return currentObserverImpact < getUserImpactLevelLimit()
599                 || getPackagesExemptFromImpactLevelThreshold().contains(
600                 versionedPackage.getPackageName());
601     }
602 
getMitigationWindowMs()603     private long getMitigationWindowMs() {
604         return SystemProperties.getLong(MITIGATION_WINDOW_MS, DEFAULT_MITIGATION_WINDOW_MS);
605     }
606 
607 
608     /**
609      * Called when the system server boots. If the system server is detected to be in a boot loop,
610      * query each observer and perform the mitigation action with the lowest user impact.
611      *
612      * Note: PackageWatchdog considers system_server restart loop as bootloop. Full reboots
613      * are not counted in bootloop.
614      * @hide
615      */
616     @SuppressWarnings("GuardedBy")
noteBoot()617     public void noteBoot() {
618         synchronized (sLock) {
619             // if boot count has reached threshold, start mitigation.
620             // We wait until threshold number of restarts only for the first time. Perform
621             // mitigations for every restart after that.
622             boolean mitigate = mBootThreshold.incrementAndTest();
623             if (mitigate) {
624                 int mitigationCount = mBootThreshold.getMitigationCount() + 1;
625                 ObserverInternal currentObserverToNotify = null;
626                 int currentObserverImpact = Integer.MAX_VALUE;
627                 for (int i = 0; i < mAllObservers.size(); i++) {
628                     final ObserverInternal observer = mAllObservers.valueAt(i);
629                     PackageHealthObserver registeredObserver = observer.registeredObserver;
630                     if (registeredObserver != null) {
631                         int impact = registeredObserver.onBootLoop(
632                                 observer.getBootMitigationCount() + 1);
633                         if (impact != PackageHealthObserverImpact.USER_IMPACT_LEVEL_0
634                                 && impact < currentObserverImpact) {
635                             currentObserverToNotify = observer;
636                             currentObserverImpact = impact;
637                         }
638                     }
639                 }
640 
641                 if (currentObserverToNotify != null) {
642                     PackageHealthObserver registeredObserver =
643                             currentObserverToNotify.registeredObserver;
644                     int currentObserverMitigationCount =
645                             currentObserverToNotify.getBootMitigationCount() + 1;
646                     currentObserverToNotify.setBootMitigationCount(
647                             currentObserverMitigationCount);
648                     saveAllObserversBootMitigationCountToMetadata(METADATA_FILE);
649                     currentObserverToNotify.observerExecutor
650                             .execute(() -> registeredObserver.onExecuteBootLoopMitigation(
651                                     currentObserverMitigationCount));
652                 }
653             }
654         }
655     }
656 
657     // TODO(b/120598832): Optimize write? Maybe only write a separate smaller file? Also
658     // avoid holding lock?
659     // This currently adds about 7ms extra to shutdown thread
660     /** @hide Writes the package information to file during shutdown. */
writeNow()661     public void writeNow() {
662         synchronized (sLock) {
663             // Must only run synchronous tasks as this runs on the ShutdownThread and no other
664             // thread is guaranteed to run during shutdown.
665             if (!mAllObservers.isEmpty()) {
666                 mLongTaskHandler.removeCallbacks(mSaveToFile);
667                 pruneObserversLocked();
668                 saveToFile();
669                 Slog.i(TAG, "Last write to update package durations");
670             }
671         }
672     }
673 
674     /**
675      * Enables or disables explicit health checks.
676      * <p> If explicit health checks are enabled, the health check service is started.
677      * <p> If explicit health checks are disabled, pending explicit health check requests are
678      * passed and the health check service is stopped.
679      */
setExplicitHealthCheckEnabled(boolean enabled)680     private void setExplicitHealthCheckEnabled(boolean enabled) {
681         synchronized (sLock) {
682             mIsHealthCheckEnabled = enabled;
683             mHealthCheckController.setEnabled(enabled);
684             mSyncRequired = true;
685             // Prune to update internal state whenever health check is enabled/disabled
686             syncState("health check state " + (enabled ? "enabled" : "disabled"));
687         }
688     }
689 
690     /**
691      * This method should be only called on mShortTaskHandler, since it modifies
692      * {@link #mNumberOfNativeCrashPollsRemaining}.
693      */
checkAndMitigateNativeCrashes()694     private void checkAndMitigateNativeCrashes() {
695         mNumberOfNativeCrashPollsRemaining--;
696         // Check if native watchdog reported a crash
697         if ("1".equals(SystemProperties.get("sys.init.updatable_crashing"))) {
698             // We rollback all available low impact rollbacks when crash is unattributable
699             notifyPackageFailure(Collections.EMPTY_LIST, FAILURE_REASON_NATIVE_CRASH);
700             // we stop polling after an attempt to execute rollback, regardless of whether the
701             // attempt succeeds or not
702         } else {
703             if (mNumberOfNativeCrashPollsRemaining > 0) {
704                 mShortTaskHandler.postDelayed(() -> checkAndMitigateNativeCrashes(),
705                         NATIVE_CRASH_POLLING_INTERVAL_MILLIS);
706             }
707         }
708     }
709 
710     /**
711      * Since this method can eventually trigger a rollback, it should be called
712      * only once boot has completed {@code onBootCompleted} and not earlier, because the install
713      * session must be entirely completed before we try to rollback.
714      * @hide
715      */
scheduleCheckAndMitigateNativeCrashes()716     public void scheduleCheckAndMitigateNativeCrashes() {
717         Slog.i(TAG, "Scheduling " + mNumberOfNativeCrashPollsRemaining + " polls to check "
718                 + "and mitigate native crashes");
719         mShortTaskHandler.post(()->checkAndMitigateNativeCrashes());
720     }
721 
getUserImpactLevelLimit()722     private int getUserImpactLevelLimit() {
723         return SystemProperties.getInt(MAJOR_USER_IMPACT_LEVEL_THRESHOLD,
724                 DEFAULT_MAJOR_USER_IMPACT_LEVEL_THRESHOLD);
725     }
726 
getPackagesExemptFromImpactLevelThreshold()727     private Set<String> getPackagesExemptFromImpactLevelThreshold() {
728         if (mPackagesExemptFromImpactLevelThreshold.isEmpty()) {
729             String packageNames = SystemProperties.get(PACKAGES_EXEMPT_FROM_IMPACT_LEVEL_THRESHOLD,
730                     DEFAULT_PACKAGES_EXEMPT_FROM_IMPACT_LEVEL_THRESHOLD);
731             return Set.of(packageNames.split("\\s*,\\s*"));
732         }
733         return mPackagesExemptFromImpactLevelThreshold;
734     }
735 
736     /**
737      * Indicates that a mitigation was successfully triggered or executed during
738      * {@link PackageHealthObserver#onExecuteHealthCheckMitigation} or
739      * {@link PackageHealthObserver#onExecuteBootLoopMitigation}.
740      */
741     public static final int MITIGATION_RESULT_SUCCESS =
742             ObserverMitigationResult.MITIGATION_RESULT_SUCCESS;
743 
744     /**
745      * Indicates that a mitigation executed during
746      * {@link PackageHealthObserver#onExecuteHealthCheckMitigation} or
747      * {@link PackageHealthObserver#onExecuteBootLoopMitigation} was skipped.
748      */
749     public static final int MITIGATION_RESULT_SKIPPED =
750             ObserverMitigationResult.MITIGATION_RESULT_SKIPPED;
751 
752 
753     /**
754      * Possible return values of the for mitigations executed during
755      * {@link PackageHealthObserver#onExecuteHealthCheckMitigation} and
756      * {@link PackageHealthObserver#onExecuteBootLoopMitigation}.
757      * @hide
758      */
759     @Retention(SOURCE)
760     @IntDef(prefix = "MITIGATION_RESULT_", value = {
761             ObserverMitigationResult.MITIGATION_RESULT_SUCCESS,
762             ObserverMitigationResult.MITIGATION_RESULT_SKIPPED,
763             })
764     public @interface ObserverMitigationResult {
765         int MITIGATION_RESULT_SUCCESS = 1;
766         int MITIGATION_RESULT_SKIPPED = 2;
767     }
768 
769     /**
770      * The minimum value that can be returned by any observer.
771      * It represents that no mitigations were available.
772      */
773     public static final int USER_IMPACT_THRESHOLD_NONE =
774             PackageHealthObserverImpact.USER_IMPACT_LEVEL_0;
775 
776     /**
777      * The mitigation impact beyond which the user will start noticing the mitigations.
778      */
779     public static final int USER_IMPACT_THRESHOLD_MEDIUM =
780             PackageHealthObserverImpact.USER_IMPACT_LEVEL_20;
781 
782     /**
783      * The mitigation impact beyond which the user impact is severely high.
784      */
785     public static final int USER_IMPACT_THRESHOLD_HIGH =
786             PackageHealthObserverImpact.USER_IMPACT_LEVEL_71;
787 
788     /**
789      * Possible severity values of the user impact of a
790      * {@link PackageHealthObserver#onExecuteHealthCheckMitigation}.
791      * @hide
792      */
793     @Retention(SOURCE)
794     @IntDef(value = {PackageHealthObserverImpact.USER_IMPACT_LEVEL_0,
795                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_10,
796                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_20,
797                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_30,
798                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_40,
799                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_50,
800                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_70,
801                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_71,
802                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_75,
803                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_80,
804                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_90,
805                      PackageHealthObserverImpact.USER_IMPACT_LEVEL_100})
806     public @interface PackageHealthObserverImpact {
807         /** No action to take. */
808         int USER_IMPACT_LEVEL_0 = 0;
809         /* Action has low user impact, user of a device will barely notice. */
810         int USER_IMPACT_LEVEL_10 = 10;
811         /* Actions having medium user impact, user of a device will likely notice. */
812         int USER_IMPACT_LEVEL_20 = 20;
813         int USER_IMPACT_LEVEL_30 = 30;
814         int USER_IMPACT_LEVEL_40 = 40;
815         int USER_IMPACT_LEVEL_50 = 50;
816         int USER_IMPACT_LEVEL_70 = 70;
817         /* Action has high user impact, a last resort, user of a device will be very frustrated. */
818         int USER_IMPACT_LEVEL_71 = 71;
819         int USER_IMPACT_LEVEL_75 = 75;
820         int USER_IMPACT_LEVEL_80 = 80;
821         int USER_IMPACT_LEVEL_90 = 90;
822         int USER_IMPACT_LEVEL_100 = 100;
823     }
824 
825     /** Register instances of this interface to receive notifications on package failure. */
826     @SuppressLint({"CallbackName"})
827     public interface PackageHealthObserver {
828         /**
829          * Called when health check fails for the {@code versionedPackage}.
830          * Note: if the returned user impact is higher than {@link #USER_IMPACT_THRESHOLD_HIGH},
831          * then {@link #onExecuteHealthCheckMitigation} would be called only in severe device
832          * conditions like boot-loop or network failure.
833          *
834          * @param versionedPackage the package that is failing. This may be null if a native
835          *                          service is crashing.
836          * @param failureReason   the type of failure that is occurring.
837          * @param mitigationCount the number of times mitigation has been called for this package
838          *                        (including this time).
839          *
840          * @return any value greater than {@link #USER_IMPACT_THRESHOLD_NONE} to express
841          * the impact of mitigation on the user in {@link #onExecuteHealthCheckMitigation}.
842          * Returning {@link #USER_IMPACT_THRESHOLD_NONE} would indicate no mitigations available.
843          */
onHealthCheckFailed( @ullable VersionedPackage versionedPackage, @FailureReasons int failureReason, int mitigationCount)844         @PackageHealthObserverImpact int onHealthCheckFailed(
845                 @Nullable VersionedPackage versionedPackage,
846                 @FailureReasons int failureReason,
847                 int mitigationCount);
848 
849         /**
850          * This would be called after {@link #onHealthCheckFailed}.
851          * This is called only if current observer returned least impact mitigation for failed
852          * health check.
853          *
854          * @param versionedPackage the package that is failing. This may be null if a native
855          *                         service is crashing.
856          * @param failureReason    the type of failure that is occurring.
857          * @param mitigationCount the number of times mitigation has been called for this package
858          *                         (including this time).
859          * @return {@link #MITIGATION_RESULT_SUCCESS} if the mitigation was successful,
860          *         or {@link #MITIGATION_RESULT_SKIPPED} if the mitigation was skipped.
861          */
onExecuteHealthCheckMitigation( @ullable VersionedPackage versionedPackage, @FailureReasons int failureReason, int mitigationCount)862         @ObserverMitigationResult int onExecuteHealthCheckMitigation(
863                 @Nullable VersionedPackage versionedPackage,
864                 @FailureReasons int failureReason, int mitigationCount);
865 
866 
867         /**
868          * Called when the system server has booted several times within a window of time, defined
869          * by {@link #mBootThreshold}
870          *
871          * @param mitigationCount the number of times mitigation has been attempted for this
872          *                        boot loop (including this time).
873          *
874          * @return any value greater than {@link #USER_IMPACT_THRESHOLD_NONE} to express
875          * the impact of mitigation on the user in {@link #onExecuteBootLoopMitigation}.
876          * Returning {@link #USER_IMPACT_THRESHOLD_NONE} would indicate no mitigations available.
877          */
onBootLoop(int mitigationCount)878         default @PackageHealthObserverImpact int onBootLoop(int mitigationCount) {
879             return PackageHealthObserverImpact.USER_IMPACT_LEVEL_0;
880         }
881 
882         /**
883          * This would be called after {@link #onBootLoop}.
884          * This is called only if current observer returned least impact mitigation for fixing
885          * boot loop.
886          *
887          * @param mitigationCount the number of times mitigation has been attempted for this
888          *                        boot loop (including this time).
889          *
890          * @return {@link #MITIGATION_RESULT_SUCCESS} if the mitigation was successful,
891          *         or {@link #MITIGATION_RESULT_SKIPPED} if the mitigation was skipped.
892          */
onExecuteBootLoopMitigation(int mitigationCount)893         default @ObserverMitigationResult int onExecuteBootLoopMitigation(int mitigationCount) {
894             return ObserverMitigationResult.MITIGATION_RESULT_SKIPPED;
895         }
896 
897         // TODO(b/120598832): Ensure uniqueness?
898         /**
899          * Identifier for the observer, should not change across device updates otherwise the
900          * watchdog may drop observing packages with the old name.
901          */
getUniqueIdentifier()902         @NonNull String getUniqueIdentifier();
903 
904         /**
905          * An observer will not be pruned if this is set, even if the observer is not explicitly
906          * monitoring any packages.
907          */
isPersistent()908         default boolean isPersistent() {
909             return false;
910         }
911 
912         /**
913          * Returns {@code true} if this observer wishes to observe the given package, {@code false}
914          * otherwise.
915          * Any failing package can be passed on to the observer. Currently the packages that have
916          * ANRs and perform {@link android.service.watchdog.ExplicitHealthCheckService} are being
917          * passed to observers in these API.
918          *
919          * <p> A persistent observer may choose to start observing certain failing packages, even if
920          * it has not explicitly asked to watch the package with {@link #startExplicitHealthCheck}.
921          */
mayObservePackage(@onNull String packageName)922         default boolean mayObservePackage(@NonNull String packageName) {
923             return false;
924         }
925     }
926 
927     @VisibleForTesting
getTriggerFailureCount()928     long getTriggerFailureCount() {
929         synchronized (sLock) {
930             return mTriggerFailureCount;
931         }
932     }
933 
934     @VisibleForTesting
getTriggerFailureDurationMs()935     long getTriggerFailureDurationMs() {
936         synchronized (sLock) {
937             return mTriggerFailureDurationMs;
938         }
939     }
940 
941     /**
942      * Serializes and syncs health check requests with the {@link ExplicitHealthCheckController}.
943      */
syncRequestsAsync()944     private void syncRequestsAsync() {
945         mShortTaskHandler.removeCallbacks(mSyncRequests);
946         mShortTaskHandler.post(mSyncRequests);
947     }
948 
949     /**
950      * Syncs health check requests with the {@link ExplicitHealthCheckController}.
951      * Calls to this must be serialized.
952      *
953      * @see #syncRequestsAsync
954      */
syncRequests()955     private void syncRequests() {
956         boolean syncRequired = false;
957         synchronized (sLock) {
958             if (mIsPackagesReady) {
959                 Set<String> packages = getPackagesPendingHealthChecksLocked();
960                 if (mSyncRequired || !packages.equals(mRequestedHealthCheckPackages)
961                         || packages.isEmpty()) {
962                     syncRequired = true;
963                     mRequestedHealthCheckPackages = packages;
964                 }
965             } // else, we will sync requests when packages become ready
966         }
967 
968         // Call outside lock to avoid holding lock when calling into the controller.
969         if (syncRequired) {
970             Slog.i(TAG, "Syncing health check requests for packages: "
971                     + mRequestedHealthCheckPackages);
972             mHealthCheckController.syncRequests(mRequestedHealthCheckPackages);
973             mSyncRequired = false;
974         }
975     }
976 
977     /**
978      * Updates the observers monitoring {@code packageName} that explicit health check has passed.
979      *
980      * <p> This update is strictly for registered observers at the time of the call
981      * Observers that register after this signal will have no knowledge of prior signals and will
982      * effectively behave as if the explicit health check hasn't passed for {@code packageName}.
983      *
984      * <p> {@code packageName} can still be considered failed if reported by
985      * {@link #notifyPackageFailureLocked} before the package expires.
986      *
987      * <p> Triggered by components outside the system server when they are fully functional after an
988      * update.
989      */
onHealthCheckPassed(String packageName)990     private void onHealthCheckPassed(String packageName) {
991         Slog.i(TAG, "Health check passed for package: " + packageName);
992         boolean isStateChanged = false;
993 
994         synchronized (sLock) {
995             for (int observerIdx = 0; observerIdx < mAllObservers.size(); observerIdx++) {
996                 ObserverInternal observer = mAllObservers.valueAt(observerIdx);
997                 MonitoredPackage monitoredPackage = observer.getMonitoredPackage(packageName);
998 
999                 if (monitoredPackage != null) {
1000                     int oldState = monitoredPackage.getHealthCheckStateLocked();
1001                     int newState = monitoredPackage.tryPassHealthCheckLocked();
1002                     isStateChanged |= oldState != newState;
1003                 }
1004             }
1005         }
1006 
1007         if (isStateChanged) {
1008             syncState("health check passed for " + packageName);
1009         }
1010     }
1011 
onSupportedPackages(List<PackageConfig> supportedPackages)1012     private void onSupportedPackages(List<PackageConfig> supportedPackages) {
1013         boolean isStateChanged = false;
1014 
1015         Map<String, Long> supportedPackageTimeouts = new ArrayMap<>();
1016         Iterator<PackageConfig> it = supportedPackages.iterator();
1017         while (it.hasNext()) {
1018             PackageConfig info = it.next();
1019             supportedPackageTimeouts.put(info.getPackageName(), info.getHealthCheckTimeoutMillis());
1020         }
1021 
1022         synchronized (sLock) {
1023             Slog.d(TAG, "Received supported packages " + supportedPackages);
1024             Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
1025             while (oit.hasNext()) {
1026                 Iterator<MonitoredPackage> pit = oit.next().getMonitoredPackages()
1027                         .values().iterator();
1028                 while (pit.hasNext()) {
1029                     MonitoredPackage monitoredPackage = pit.next();
1030                     String packageName = monitoredPackage.getName();
1031                     int oldState = monitoredPackage.getHealthCheckStateLocked();
1032                     int newState;
1033 
1034                     if (supportedPackageTimeouts.containsKey(packageName)) {
1035                         // Supported packages become ACTIVE if currently INACTIVE
1036                         newState = monitoredPackage.setHealthCheckActiveLocked(
1037                                 supportedPackageTimeouts.get(packageName));
1038                     } else {
1039                         // Unsupported packages are marked as PASSED unless already FAILED
1040                         newState = monitoredPackage.tryPassHealthCheckLocked();
1041                     }
1042                     isStateChanged |= oldState != newState;
1043                 }
1044             }
1045         }
1046 
1047         if (isStateChanged) {
1048             syncState("updated health check supported packages " + supportedPackages);
1049         }
1050     }
1051 
onSyncRequestNotified()1052     private void onSyncRequestNotified() {
1053         synchronized (sLock) {
1054             mSyncRequired = true;
1055             syncRequestsAsync();
1056         }
1057     }
1058 
1059     @GuardedBy("sLock")
getPackagesPendingHealthChecksLocked()1060     private Set<String> getPackagesPendingHealthChecksLocked() {
1061         Set<String> packages = new ArraySet<>();
1062         Iterator<ObserverInternal> oit = mAllObservers.values().iterator();
1063         while (oit.hasNext()) {
1064             ObserverInternal observer = oit.next();
1065             Iterator<MonitoredPackage> pit =
1066                     observer.getMonitoredPackages().values().iterator();
1067             while (pit.hasNext()) {
1068                 MonitoredPackage monitoredPackage = pit.next();
1069                 String packageName = monitoredPackage.getName();
1070                 if (monitoredPackage.isPendingHealthChecksLocked()) {
1071                     packages.add(packageName);
1072                 }
1073             }
1074         }
1075         return packages;
1076     }
1077 
1078     /**
1079      * Syncs the state of the observers.
1080      *
1081      * <p> Prunes all observers, saves new state to disk, syncs health check requests with the
1082      * health check service and schedules the next state sync.
1083      */
syncState(String reason)1084     private void syncState(String reason) {
1085         synchronized (sLock) {
1086             Slog.i(TAG, "Syncing state, reason: " + reason);
1087             pruneObserversLocked();
1088 
1089             saveToFileAsync();
1090             syncRequestsAsync();
1091 
1092             // Done syncing state, schedule the next state sync
1093             scheduleNextSyncStateLocked();
1094         }
1095     }
1096 
syncStateWithScheduledReason()1097     private void syncStateWithScheduledReason() {
1098         syncState("scheduled");
1099     }
1100 
1101     @GuardedBy("sLock")
scheduleNextSyncStateLocked()1102     private void scheduleNextSyncStateLocked() {
1103         long durationMs = getNextStateSyncMillisLocked();
1104         mShortTaskHandler.removeCallbacks(mSyncStateWithScheduledReason);
1105         if (durationMs == Long.MAX_VALUE) {
1106             Slog.i(TAG, "Cancelling state sync, nothing to sync");
1107             mUptimeAtLastStateSync = 0;
1108         } else {
1109             mUptimeAtLastStateSync = mSystemClock.uptimeMillis();
1110             mShortTaskHandler.postDelayed(mSyncStateWithScheduledReason, durationMs);
1111         }
1112     }
1113 
1114     /**
1115      * Returns the next duration in millis to sync the watchdog state.
1116      *
1117      * @returns Long#MAX_VALUE if there are no observed packages.
1118      */
1119     @GuardedBy("sLock")
getNextStateSyncMillisLocked()1120     private long getNextStateSyncMillisLocked() {
1121         long shortestDurationMs = Long.MAX_VALUE;
1122         for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
1123             ArrayMap<String, MonitoredPackage> packages = mAllObservers.valueAt(oIndex)
1124                     .getMonitoredPackages();
1125             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
1126                 MonitoredPackage mp = packages.valueAt(pIndex);
1127                 long duration = mp.getShortestScheduleDurationMsLocked();
1128                 if (duration < shortestDurationMs) {
1129                     shortestDurationMs = duration;
1130                 }
1131             }
1132         }
1133         return shortestDurationMs;
1134     }
1135 
1136     /**
1137      * Removes {@code elapsedMs} milliseconds from all durations on monitored packages
1138      * and updates other internal state.
1139      */
1140     @GuardedBy("sLock")
pruneObserversLocked()1141     private void pruneObserversLocked() {
1142         long elapsedMs = mUptimeAtLastStateSync == 0
1143                 ? 0 : mSystemClock.uptimeMillis() - mUptimeAtLastStateSync;
1144         if (elapsedMs <= 0) {
1145             Slog.i(TAG, "Not pruning observers, elapsed time: " + elapsedMs + "ms");
1146             return;
1147         }
1148 
1149         Iterator<ObserverInternal> it = mAllObservers.values().iterator();
1150         while (it.hasNext()) {
1151             ObserverInternal observer = it.next();
1152             Set<MonitoredPackage> failedPackages =
1153                     observer.prunePackagesLocked(elapsedMs);
1154             if (!failedPackages.isEmpty()) {
1155                 onHealthCheckFailed(observer, failedPackages);
1156             }
1157             if (observer.getMonitoredPackages().isEmpty() && (observer.registeredObserver == null
1158                     || !observer.registeredObserver.isPersistent())) {
1159                 Slog.i(TAG, "Discarding observer " + observer.name + ". All packages expired");
1160                 it.remove();
1161             }
1162         }
1163     }
1164 
onHealthCheckFailed(ObserverInternal observer, Set<MonitoredPackage> failedPackages)1165     private void onHealthCheckFailed(ObserverInternal observer,
1166             Set<MonitoredPackage> failedPackages) {
1167         mLongTaskHandler.post(() -> {
1168             synchronized (sLock) {
1169                 PackageHealthObserver registeredObserver = observer.registeredObserver;
1170                 if (registeredObserver != null) {
1171                     Iterator<MonitoredPackage> it = failedPackages.iterator();
1172                     while (it.hasNext()) {
1173                         VersionedPackage versionedPkg = getVersionedPackage(it.next().getName());
1174                         if (versionedPkg != null) {
1175                             Slog.i(TAG,
1176                                     "Explicit health check failed for package " + versionedPkg);
1177                             observer.observerExecutor.execute(() ->
1178                                     registeredObserver.onExecuteHealthCheckMitigation(versionedPkg,
1179                                             PackageWatchdog.FAILURE_REASON_EXPLICIT_HEALTH_CHECK,
1180                                             1));
1181                         }
1182                     }
1183                 }
1184             }
1185         });
1186     }
1187 
1188     /**
1189      * Gets PackageInfo for the given package. Matches any user and apex.
1190      *
1191      * @throws PackageManager.NameNotFoundException if no such package is installed.
1192      */
getPackageInfo(String packageName)1193     private PackageInfo getPackageInfo(String packageName)
1194             throws PackageManager.NameNotFoundException {
1195         PackageManager pm = mContext.getPackageManager();
1196         try {
1197             // The MATCH_ANY_USER flag doesn't mix well with the MATCH_APEX
1198             // flag, so make two separate attempts to get the package info.
1199             // We don't need both flags at the same time because we assume
1200             // apex files are always installed for all users.
1201             return pm.getPackageInfo(packageName, PackageManager.MATCH_ANY_USER);
1202         } catch (PackageManager.NameNotFoundException e) {
1203             return pm.getPackageInfo(packageName, PackageManager.MATCH_APEX);
1204         }
1205     }
1206 
1207     @Nullable
getVersionedPackage(String packageName)1208     private VersionedPackage getVersionedPackage(String packageName) {
1209         final PackageManager pm = mContext.getPackageManager();
1210         if (pm == null || TextUtils.isEmpty(packageName)) {
1211             return null;
1212         }
1213         try {
1214             final long versionCode = getPackageInfo(packageName).getLongVersionCode();
1215             return new VersionedPackage(packageName, versionCode);
1216         } catch (PackageManager.NameNotFoundException e) {
1217             return null;
1218         }
1219     }
1220 
1221     /**
1222      * Loads mAllObservers from file.
1223      *
1224      * <p>Note that this is <b>not</b> thread safe and should only called be called
1225      * from the constructor.
1226      */
loadFromFile()1227     private void loadFromFile() {
1228         InputStream infile = null;
1229         mAllObservers.clear();
1230         try {
1231             infile = mPolicyFile.openRead();
1232             final XmlPullParser parser = Xml.newPullParser();
1233             parser.setInput(infile, UTF_8.name());
1234             XmlUtils.beginDocument(parser, TAG_PACKAGE_WATCHDOG);
1235             int outerDepth = parser.getDepth();
1236             while (XmlUtils.nextElementWithin(parser, outerDepth)) {
1237                 ObserverInternal observer = ObserverInternal.read(parser, this);
1238                 if (observer != null) {
1239                     mAllObservers.put(observer.name, observer);
1240                 }
1241             }
1242         } catch (FileNotFoundException e) {
1243             // Nothing to monitor
1244         } catch (Exception e) {
1245             Slog.wtf(TAG, "Unable to read monitored packages, deleting file", e);
1246             mPolicyFile.delete();
1247         } finally {
1248             IoUtils.closeQuietly(infile);
1249         }
1250     }
1251 
onPropertyChanged(DeviceConfig.Properties properties)1252     private void onPropertyChanged(DeviceConfig.Properties properties) {
1253         try {
1254             updateConfigs();
1255         } catch (Exception ignore) {
1256             Slog.w(TAG, "Failed to reload device config changes");
1257         }
1258     }
1259 
1260     /** Adds a {@link DeviceConfig#OnPropertiesChangedListener}. */
setPropertyChangedListenerLocked()1261     private void setPropertyChangedListenerLocked() {
1262         DeviceConfig.addOnPropertiesChangedListener(
1263                 DeviceConfig.NAMESPACE_ROLLBACK,
1264                 mContext.getMainExecutor(),
1265                 mOnPropertyChangedListener);
1266     }
1267 
1268     @VisibleForTesting
removePropertyChangedListener()1269     void removePropertyChangedListener() {
1270         DeviceConfig.removeOnPropertiesChangedListener(mOnPropertyChangedListener);
1271     }
1272 
1273     /**
1274      * Health check is enabled or disabled after reading the flags
1275      * from DeviceConfig.
1276      */
1277     @VisibleForTesting
updateConfigs()1278     void updateConfigs() {
1279         synchronized (sLock) {
1280             mTriggerFailureCount = DeviceConfig.getInt(
1281                     DeviceConfig.NAMESPACE_ROLLBACK,
1282                     PROPERTY_WATCHDOG_TRIGGER_FAILURE_COUNT,
1283                     DEFAULT_TRIGGER_FAILURE_COUNT);
1284             if (mTriggerFailureCount <= 0) {
1285                 mTriggerFailureCount = DEFAULT_TRIGGER_FAILURE_COUNT;
1286             }
1287 
1288             mTriggerFailureDurationMs = DeviceConfig.getInt(
1289                     DeviceConfig.NAMESPACE_ROLLBACK,
1290                     PROPERTY_WATCHDOG_TRIGGER_DURATION_MILLIS,
1291                     DEFAULT_TRIGGER_FAILURE_DURATION_MS);
1292             if (mTriggerFailureDurationMs <= 0) {
1293                 mTriggerFailureDurationMs = DEFAULT_TRIGGER_FAILURE_DURATION_MS;
1294             }
1295 
1296             setExplicitHealthCheckEnabled(DeviceConfig.getBoolean(
1297                     DeviceConfig.NAMESPACE_ROLLBACK,
1298                     PROPERTY_WATCHDOG_EXPLICIT_HEALTH_CHECK_ENABLED,
1299                     DEFAULT_EXPLICIT_HEALTH_CHECK_ENABLED));
1300         }
1301     }
1302 
1303     /**
1304      * Persists mAllObservers to file. Threshold information is ignored.
1305      */
saveToFile()1306     private boolean saveToFile() {
1307         Slog.i(TAG, "Saving observer state to file");
1308         synchronized (sLock) {
1309             FileOutputStream stream;
1310             try {
1311                 stream = mPolicyFile.startWrite();
1312             } catch (IOException e) {
1313                 Slog.w(TAG, "Cannot update monitored packages", e);
1314                 return false;
1315             }
1316 
1317             try {
1318                 XmlSerializer out = new FastXmlSerializer();
1319                 out.setOutput(stream, UTF_8.name());
1320                 out.startDocument(null, true);
1321                 out.startTag(null, TAG_PACKAGE_WATCHDOG);
1322                 out.attribute(null, ATTR_VERSION, Integer.toString(DB_VERSION));
1323                 for (int oIndex = 0; oIndex < mAllObservers.size(); oIndex++) {
1324                     mAllObservers.valueAt(oIndex).writeLocked(out);
1325                 }
1326                 out.endTag(null, TAG_PACKAGE_WATCHDOG);
1327                 out.endDocument();
1328                 mPolicyFile.finishWrite(stream);
1329                 return true;
1330             } catch (IOException e) {
1331                 Slog.w(TAG, "Failed to save monitored packages, restoring backup", e);
1332                 mPolicyFile.failWrite(stream);
1333                 return false;
1334             }
1335         }
1336     }
1337 
saveToFileAsync()1338     private void saveToFileAsync() {
1339         if (!mLongTaskHandler.hasCallbacks(mSaveToFile)) {
1340             mLongTaskHandler.post(mSaveToFile);
1341         }
1342     }
1343 
1344     /** @hide Convert a {@code LongArrayQueue} to a String of comma-separated values. */
longArrayQueueToString(LongArrayQueue queue)1345     public static String longArrayQueueToString(LongArrayQueue queue) {
1346         if (queue.size() > 0) {
1347             StringBuilder sb = new StringBuilder();
1348             sb.append(queue.get(0));
1349             for (int i = 1; i < queue.size(); i++) {
1350                 sb.append(",");
1351                 sb.append(queue.get(i));
1352             }
1353             return sb.toString();
1354         }
1355         return "";
1356     }
1357 
1358     /** @hide Parse a comma-separated String of longs into a LongArrayQueue. */
parseLongArrayQueue(String commaSeparatedValues)1359     public static LongArrayQueue parseLongArrayQueue(String commaSeparatedValues) {
1360         LongArrayQueue result = new LongArrayQueue();
1361         if (!TextUtils.isEmpty(commaSeparatedValues)) {
1362             String[] values = commaSeparatedValues.split(",");
1363             for (String value : values) {
1364                 result.addLast(Long.parseLong(value));
1365             }
1366         }
1367         return result;
1368     }
1369 
1370 
1371     /** Dump status of every observer in mAllObservers. */
dump(@onNull PrintWriter pw)1372     public void dump(@NonNull PrintWriter pw) {
1373         if (Flags.synchronousRebootInRescueParty() && isRecoveryTriggeredReboot()) {
1374             dumpInternal(pw);
1375         } else {
1376             synchronized (sLock) {
1377                 dumpInternal(pw);
1378             }
1379         }
1380     }
1381 
1382     /**
1383      * Check if we're currently attempting to reboot during mitigation. This method must return
1384      * true if triggered reboot early during a boot loop, since the device will not be fully booted
1385      * at this time.
1386      */
isRecoveryTriggeredReboot()1387     public static boolean isRecoveryTriggeredReboot() {
1388         return isFactoryResetPropertySet() || isRebootPropertySet();
1389     }
1390 
isFactoryResetPropertySet()1391     private static boolean isFactoryResetPropertySet() {
1392         return CrashRecoveryProperties.attemptingFactoryReset().orElse(false);
1393     }
1394 
isRebootPropertySet()1395     private static boolean isRebootPropertySet() {
1396         return CrashRecoveryProperties.attemptingReboot().orElse(false);
1397     }
1398 
dumpInternal(@onNull PrintWriter pw)1399     private void dumpInternal(@NonNull PrintWriter pw) {
1400         IndentingPrintWriter ipw = new IndentingPrintWriter(pw, "  ");
1401         ipw.println("Package Watchdog status");
1402         ipw.increaseIndent();
1403         synchronized (sLock) {
1404             for (String observerName : mAllObservers.keySet()) {
1405                 ipw.println("Observer name: " + observerName);
1406                 ipw.increaseIndent();
1407                 ObserverInternal observerInternal = mAllObservers.get(observerName);
1408                 observerInternal.dump(ipw);
1409                 ipw.decreaseIndent();
1410             }
1411         }
1412         ipw.decreaseIndent();
1413         dumpCrashRecoveryEvents(ipw);
1414     }
1415 
1416     @VisibleForTesting
1417     @GuardedBy("sLock")
registerObserverInternal(ObserverInternal observerInternal)1418     void registerObserverInternal(ObserverInternal observerInternal) {
1419         mAllObservers.put(observerInternal.name, observerInternal);
1420     }
1421 
1422     /**
1423      * Represents an observer monitoring a set of packages along with the failure thresholds for
1424      * each package.
1425      *
1426      * <p> Note, the PackageWatchdog#sLock must always be held when reading or writing
1427      * instances of this class.
1428      */
1429     static class ObserverInternal {
1430         public final String name;
1431         @GuardedBy("sLock")
1432         private final ArrayMap<String, MonitoredPackage> mPackages = new ArrayMap<>();
1433         @Nullable
1434         @GuardedBy("sLock")
1435         public PackageHealthObserver registeredObserver;
1436         public Executor observerExecutor;
1437         private int mMitigationCount;
1438 
ObserverInternal(String name, List<MonitoredPackage> packages)1439         ObserverInternal(String name, List<MonitoredPackage> packages) {
1440             this(name, packages, /*mitigationCount=*/ 0);
1441         }
1442 
ObserverInternal(String name, List<MonitoredPackage> packages, int mitigationCount)1443         ObserverInternal(String name, List<MonitoredPackage> packages, int mitigationCount) {
1444             this.name = name;
1445             updatePackagesLocked(packages);
1446             this.mMitigationCount = mitigationCount;
1447         }
1448 
1449         /**
1450          * Writes important {@link MonitoredPackage} details for this observer to file.
1451          * Does not persist any package failure thresholds.
1452          */
1453         @GuardedBy("sLock")
writeLocked(XmlSerializer out)1454         public boolean writeLocked(XmlSerializer out) {
1455             try {
1456                 out.startTag(null, TAG_OBSERVER);
1457                 out.attribute(null, ATTR_NAME, name);
1458                 out.attribute(null, ATTR_MITIGATION_COUNT, Integer.toString(mMitigationCount));
1459                 for (int i = 0; i < mPackages.size(); i++) {
1460                     MonitoredPackage p = mPackages.valueAt(i);
1461                     p.writeLocked(out);
1462                 }
1463                 out.endTag(null, TAG_OBSERVER);
1464                 return true;
1465             } catch (IOException e) {
1466                 Slog.w(TAG, "Cannot save observer", e);
1467                 return false;
1468             }
1469         }
1470 
getBootMitigationCount()1471         public int getBootMitigationCount() {
1472             return mMitigationCount;
1473         }
1474 
setBootMitigationCount(int mitigationCount)1475         public void setBootMitigationCount(int mitigationCount) {
1476             mMitigationCount = mitigationCount;
1477         }
1478 
1479         @GuardedBy("sLock")
updatePackagesLocked(List<MonitoredPackage> packages)1480         public void updatePackagesLocked(List<MonitoredPackage> packages) {
1481             for (int pIndex = 0; pIndex < packages.size(); pIndex++) {
1482                 MonitoredPackage p = packages.get(pIndex);
1483                 MonitoredPackage existingPackage = getMonitoredPackage(p.getName());
1484                 if (existingPackage != null) {
1485                     existingPackage.updateHealthCheckDuration(p.mDurationMs);
1486                 } else {
1487                     putMonitoredPackage(p);
1488                 }
1489             }
1490         }
1491 
1492         /**
1493          * Reduces the monitoring durations of all packages observed by this observer by
1494          * {@code elapsedMs}. If any duration is less than 0, the package is removed from
1495          * observation. If any health check duration is less than 0, the health check result
1496          * is evaluated.
1497          *
1498          * @return a {@link Set} of packages that were removed from the observer without explicit
1499          * health check passing, or an empty list if no package expired for which an explicit health
1500          * check was still pending
1501          */
1502         @GuardedBy("sLock")
prunePackagesLocked(long elapsedMs)1503         private Set<MonitoredPackage> prunePackagesLocked(long elapsedMs) {
1504             Set<MonitoredPackage> failedPackages = new ArraySet<>();
1505             Iterator<MonitoredPackage> it = mPackages.values().iterator();
1506             while (it.hasNext()) {
1507                 MonitoredPackage p = it.next();
1508                 int oldState = p.getHealthCheckStateLocked();
1509                 int newState = p.handleElapsedTimeLocked(elapsedMs);
1510                 if (oldState != HealthCheckState.FAILED
1511                         && newState == HealthCheckState.FAILED) {
1512                     Slog.i(TAG, "Package " + p.getName() + " failed health check");
1513                     failedPackages.add(p);
1514                 }
1515                 if (p.isExpiredLocked()) {
1516                     it.remove();
1517                 }
1518             }
1519             return failedPackages;
1520         }
1521 
1522         /**
1523          * Increments failure counts of {@code packageName}.
1524          * @returns {@code true} if failure threshold is exceeded, {@code false} otherwise
1525          * @hide
1526          */
1527         @GuardedBy("sLock")
notifyPackageFailureLocked(String packageName)1528         public boolean notifyPackageFailureLocked(String packageName) {
1529             if (getMonitoredPackage(packageName) == null && registeredObserver.isPersistent()
1530                     && registeredObserver.mayObservePackage(packageName)) {
1531                 putMonitoredPackage(sPackageWatchdog.newMonitoredPackage(
1532                         packageName, DEFAULT_OBSERVING_DURATION_MS, false));
1533             }
1534             MonitoredPackage p = getMonitoredPackage(packageName);
1535             if (p != null) {
1536                 return p.onFailureLocked();
1537             }
1538             return false;
1539         }
1540 
1541         /**
1542          * Returns the map of packages monitored by this observer.
1543          *
1544          * @return a mapping of package names to {@link MonitoredPackage} objects.
1545          */
1546         @GuardedBy("sLock")
getMonitoredPackages()1547         public ArrayMap<String, MonitoredPackage> getMonitoredPackages() {
1548             return mPackages;
1549         }
1550 
1551         /**
1552          * Returns the {@link MonitoredPackage} associated with a given package name if the
1553          * package is being monitored by this observer.
1554          *
1555          * @param packageName: the name of the package.
1556          * @return the {@link MonitoredPackage} object associated with the package name if one
1557          *         exists, {@code null} otherwise.
1558          */
1559         @GuardedBy("sLock")
1560         @Nullable
getMonitoredPackage(String packageName)1561         public MonitoredPackage getMonitoredPackage(String packageName) {
1562             return mPackages.get(packageName);
1563         }
1564 
1565         /**
1566          * Associates a {@link MonitoredPackage} with the observer.
1567          *
1568          * @param p: the {@link MonitoredPackage} to store.
1569          */
1570         @GuardedBy("sLock")
putMonitoredPackage(MonitoredPackage p)1571         public void putMonitoredPackage(MonitoredPackage p) {
1572             mPackages.put(p.getName(), p);
1573         }
1574 
1575         /**
1576          * Returns one ObserverInternal from the {@code parser} and advances its state.
1577          *
1578          * <p>Note that this method is <b>not</b> thread safe. It should only be called from
1579          * #loadFromFile which in turn is only called on construction of the
1580          * singleton PackageWatchdog.
1581          **/
read(XmlPullParser parser, PackageWatchdog watchdog)1582         public static ObserverInternal read(XmlPullParser parser, PackageWatchdog watchdog) {
1583             String observerName = null;
1584             int observerMitigationCount = 0;
1585             if (TAG_OBSERVER.equals(parser.getName())) {
1586                 observerName = parser.getAttributeValue(null, ATTR_NAME);
1587                 if (TextUtils.isEmpty(observerName)) {
1588                     Slog.wtf(TAG, "Unable to read observer name");
1589                     return null;
1590                 }
1591             }
1592             List<MonitoredPackage> packages = new ArrayList<>();
1593             int innerDepth = parser.getDepth();
1594             try {
1595                 try {
1596                     observerMitigationCount = Integer.parseInt(
1597                             parser.getAttributeValue(null, ATTR_MITIGATION_COUNT));
1598                 } catch (Exception e) {
1599                     Slog.i(
1600                         TAG,
1601                         "ObserverInternal mitigation count was not present.");
1602                 }
1603                 while (XmlUtils.nextElementWithin(parser, innerDepth)) {
1604                     if (TAG_PACKAGE.equals(parser.getName())) {
1605                         try {
1606                             MonitoredPackage pkg = watchdog.parseMonitoredPackage(parser);
1607                             if (pkg != null) {
1608                                 packages.add(pkg);
1609                             }
1610                         } catch (NumberFormatException e) {
1611                             Slog.wtf(TAG, "Skipping package for observer " + observerName, e);
1612                             continue;
1613                         }
1614                     }
1615                 }
1616             } catch (XmlPullParserException | IOException e) {
1617                 Slog.wtf(TAG, "Unable to read observer " + observerName, e);
1618                 return null;
1619             }
1620             if (packages.isEmpty()) {
1621                 return null;
1622             }
1623             return new ObserverInternal(observerName, packages, observerMitigationCount);
1624         }
1625 
1626         /** Dumps information about this observer and the packages it watches. */
dump(IndentingPrintWriter pw)1627         public void dump(IndentingPrintWriter pw) {
1628             boolean isPersistent = registeredObserver != null && registeredObserver.isPersistent();
1629             pw.println("Persistent: " + isPersistent);
1630             for (String packageName : mPackages.keySet()) {
1631                 MonitoredPackage p = getMonitoredPackage(packageName);
1632                 pw.println(packageName +  ": ");
1633                 pw.increaseIndent();
1634                 pw.println("# Failures: " + p.mFailureHistory.size());
1635                 pw.println("Monitoring duration remaining: " + p.mDurationMs + "ms");
1636                 pw.println("Explicit health check duration: " + p.mHealthCheckDurationMs + "ms");
1637                 pw.println("Health check state: " + p.toString(p.mHealthCheckState));
1638                 pw.decreaseIndent();
1639             }
1640         }
1641     }
1642 
1643     /** @hide */
1644     @Retention(SOURCE)
1645     @IntDef(value = {
1646             HealthCheckState.ACTIVE,
1647             HealthCheckState.INACTIVE,
1648             HealthCheckState.PASSED,
1649             HealthCheckState.FAILED})
1650     public @interface HealthCheckState {
1651         // The package has not passed health check but has requested a health check
1652         int ACTIVE = 0;
1653         // The package has not passed health check and has not requested a health check
1654         int INACTIVE = 1;
1655         // The package has passed health check
1656         int PASSED = 2;
1657         // The package has failed health check
1658         int FAILED = 3;
1659     }
1660 
newMonitoredPackage( String name, long durationMs, boolean hasPassedHealthCheck)1661     MonitoredPackage newMonitoredPackage(
1662             String name, long durationMs, boolean hasPassedHealthCheck) {
1663         return newMonitoredPackage(name, durationMs, Long.MAX_VALUE, hasPassedHealthCheck,
1664                 new LongArrayQueue());
1665     }
1666 
newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs, boolean hasPassedHealthCheck, LongArrayQueue mitigationCalls)1667     MonitoredPackage newMonitoredPackage(String name, long durationMs, long healthCheckDurationMs,
1668             boolean hasPassedHealthCheck, LongArrayQueue mitigationCalls) {
1669         return new MonitoredPackage(name, durationMs, healthCheckDurationMs,
1670                 hasPassedHealthCheck, mitigationCalls);
1671     }
1672 
parseMonitoredPackage(XmlPullParser parser)1673     MonitoredPackage parseMonitoredPackage(XmlPullParser parser)
1674             throws XmlPullParserException {
1675         String packageName = parser.getAttributeValue(null, ATTR_NAME);
1676         long duration = Long.parseLong(parser.getAttributeValue(null, ATTR_DURATION));
1677         long healthCheckDuration = Long.parseLong(parser.getAttributeValue(null,
1678                 ATTR_EXPLICIT_HEALTH_CHECK_DURATION));
1679         boolean hasPassedHealthCheck = Boolean.parseBoolean(parser.getAttributeValue(null,
1680                 ATTR_PASSED_HEALTH_CHECK));
1681         LongArrayQueue mitigationCalls = parseLongArrayQueue(
1682                 parser.getAttributeValue(null, ATTR_MITIGATION_CALLS));
1683         return newMonitoredPackage(packageName,
1684                 duration, healthCheckDuration, hasPassedHealthCheck, mitigationCalls);
1685     }
1686 
1687     /**
1688      * Represents a package and its health check state along with the time
1689      * it should be monitored for.
1690      *
1691      * <p> Note, the PackageWatchdog#sLock must always be held when reading or writing
1692      * instances of this class.
1693      */
1694     class MonitoredPackage {
1695         private final String mPackageName;
1696         // Times when package failures happen sorted in ascending order
1697         @GuardedBy("sLock")
1698         private final LongArrayQueue mFailureHistory = new LongArrayQueue();
1699         // Times when an observer was called to mitigate this package's failure. Sorted in
1700         // ascending order.
1701         @GuardedBy("sLock")
1702         private final LongArrayQueue mMitigationCalls;
1703         // One of STATE_[ACTIVE|INACTIVE|PASSED|FAILED]. Updated on construction and after
1704         // methods that could change the health check state: handleElapsedTimeLocked and
1705         // tryPassHealthCheckLocked
1706         private int mHealthCheckState = HealthCheckState.INACTIVE;
1707         // Whether an explicit health check has passed.
1708         // This value in addition with mHealthCheckDurationMs determines the health check state
1709         // of the package, see #getHealthCheckStateLocked
1710         @GuardedBy("sLock")
1711         private boolean mHasPassedHealthCheck;
1712         // System uptime duration to monitor package.
1713         @GuardedBy("sLock")
1714         private long mDurationMs;
1715         // System uptime duration to check the result of an explicit health check
1716         // Initially, MAX_VALUE until we get a value from the health check service
1717         // and request health checks.
1718         // This value in addition with mHasPassedHealthCheck determines the health check state
1719         // of the package, see #getHealthCheckStateLocked
1720         @GuardedBy("sLock")
1721         private long mHealthCheckDurationMs = Long.MAX_VALUE;
1722 
/**
 * Creates a monitored package and derives its initial health check state from the given
 * durations and {@code hasPassedHealthCheck}.
 */
MonitoredPackage(String packageName, long durationMs,
        long healthCheckDurationMs, boolean hasPassedHealthCheck,
        LongArrayQueue mitigationCalls) {
    this.mPackageName = packageName;
    this.mMitigationCalls = mitigationCalls;
    this.mDurationMs = durationMs;
    this.mHealthCheckDurationMs = healthCheckDurationMs;
    this.mHasPassedHealthCheck = hasPassedHealthCheck;
    // Must run last: the state is computed from the fields assigned above.
    updateHealthCheckStateLocked();
}
1733 
1734         /** Writes the salient fields to disk using {@code out}.
1735          * @hide
1736          */
1737         @GuardedBy("sLock")
writeLocked(XmlSerializer out)1738         public void writeLocked(XmlSerializer out) throws IOException {
1739             out.startTag(null, TAG_PACKAGE);
1740             out.attribute(null, ATTR_NAME, getName());
1741             out.attribute(null, ATTR_DURATION, Long.toString(mDurationMs));
1742             out.attribute(null, ATTR_EXPLICIT_HEALTH_CHECK_DURATION,
1743                     Long.toString(mHealthCheckDurationMs));
1744             out.attribute(null, ATTR_PASSED_HEALTH_CHECK, Boolean.toString(mHasPassedHealthCheck));
1745             LongArrayQueue normalizedCalls = normalizeMitigationCalls();
1746             out.attribute(null, ATTR_MITIGATION_CALLS, longArrayQueueToString(normalizedCalls));
1747             out.endTag(null, TAG_PACKAGE);
1748         }
1749 
1750         /**
1751          * Increment package failures or resets failure count depending on the last package failure.
1752          *
1753          * @return {@code true} if failure count exceeds a threshold, {@code false} otherwise
1754          */
1755         @GuardedBy("sLock")
onFailureLocked()1756         public boolean onFailureLocked() {
1757             // Sliding window algorithm: find out if there exists a window containing failures >=
1758             // mTriggerFailureCount.
1759             final long now = mSystemClock.uptimeMillis();
1760             mFailureHistory.addLast(now);
1761             while (now - mFailureHistory.peekFirst() > mTriggerFailureDurationMs) {
1762                 // Prune values falling out of the window
1763                 mFailureHistory.removeFirst();
1764             }
1765             boolean failed = mFailureHistory.size() >= mTriggerFailureCount;
1766             if (failed) {
1767                 mFailureHistory.clear();
1768             }
1769             return failed;
1770         }
1771 
1772         /**
1773          * Notes the timestamp of a mitigation call into the observer.
1774          */
1775         @GuardedBy("sLock")
noteMitigationCallLocked()1776         public void noteMitigationCallLocked() {
1777             mMitigationCalls.addLast(mSystemClock.uptimeMillis());
1778         }
1779 
1780         /**
1781          * Prunes any mitigation calls outside of the de-escalation window, and returns the
1782          * number of calls that are in the window afterwards.
1783          *
1784          * @return the number of mitigation calls made in the de-escalation window.
1785          */
1786         @GuardedBy("sLock")
getMitigationCountLocked()1787         public int getMitigationCountLocked() {
1788             try {
1789                 final long now = mSystemClock.uptimeMillis();
1790                 while (now - mMitigationCalls.peekFirst() > DEFAULT_DEESCALATION_WINDOW_MS) {
1791                     mMitigationCalls.removeFirst();
1792                 }
1793             } catch (NoSuchElementException ignore) {
1794             }
1795 
1796             return mMitigationCalls.size();
1797         }
1798 
1799         /**
1800          * Before writing to disk, make the mitigation call timestamps relative to the current
1801          * system uptime. This is because they need to be relative to the uptime which will reset
1802          * at the next boot.
1803          *
1804          * @return a LongArrayQueue of the mitigation calls relative to the current system uptime.
1805          */
1806         @GuardedBy("sLock")
normalizeMitigationCalls()1807         public LongArrayQueue normalizeMitigationCalls() {
1808             LongArrayQueue normalized = new LongArrayQueue();
1809             final long now = mSystemClock.uptimeMillis();
1810             for (int i = 0; i < mMitigationCalls.size(); i++) {
1811                 normalized.addLast(mMitigationCalls.get(i) - now);
1812             }
1813             return normalized;
1814         }
1815 
1816         /**
1817          * Sets the initial health check duration.
1818          *
1819          * @return the new health check state
1820          */
1821         @GuardedBy("sLock")
setHealthCheckActiveLocked(long initialHealthCheckDurationMs)1822         public int setHealthCheckActiveLocked(long initialHealthCheckDurationMs) {
1823             if (initialHealthCheckDurationMs <= 0) {
1824                 Slog.wtf(TAG, "Cannot set non-positive health check duration "
1825                         + initialHealthCheckDurationMs + "ms for package " + getName()
1826                         + ". Using total duration " + mDurationMs + "ms instead");
1827                 initialHealthCheckDurationMs = mDurationMs;
1828             }
1829             if (mHealthCheckState == HealthCheckState.INACTIVE) {
1830                 // Transitions to ACTIVE
1831                 mHealthCheckDurationMs = initialHealthCheckDurationMs;
1832             }
1833             return updateHealthCheckStateLocked();
1834         }
1835 
1836         /**
1837          * Updates the monitoring durations of the package.
1838          *
1839          * @return the new health check state
1840          */
1841         @GuardedBy("sLock")
handleElapsedTimeLocked(long elapsedMs)1842         public int handleElapsedTimeLocked(long elapsedMs) {
1843             if (elapsedMs <= 0) {
1844                 Slog.w(TAG, "Cannot handle non-positive elapsed time for package " + getName());
1845                 return mHealthCheckState;
1846             }
1847             // Transitions to FAILED if now <= 0 and health check not passed
1848             mDurationMs -= elapsedMs;
1849             if (mHealthCheckState == HealthCheckState.ACTIVE) {
1850                 // We only update health check durations if we have #setHealthCheckActiveLocked
1851                 // This ensures we don't leave the INACTIVE state for an unexpected elapsed time
1852                 // Transitions to FAILED if now <= 0 and health check not passed
1853                 mHealthCheckDurationMs -= elapsedMs;
1854             }
1855             return updateHealthCheckStateLocked();
1856         }
1857 
1858         /** Explicitly update the monitoring duration of the package. */
1859         @GuardedBy("sLock")
updateHealthCheckDuration(long newDurationMs)1860         public void updateHealthCheckDuration(long newDurationMs) {
1861             mDurationMs = newDurationMs;
1862         }
1863 
1864         /**
1865          * Marks the health check as passed and transitions to {@link HealthCheckState.PASSED}
1866          * if not yet {@link HealthCheckState.FAILED}.
1867          *
1868          * @return the new {@link HealthCheckState health check state}
1869          */
1870         @GuardedBy("sLock")
1871         @HealthCheckState
tryPassHealthCheckLocked()1872         public int tryPassHealthCheckLocked() {
1873             if (mHealthCheckState != HealthCheckState.FAILED) {
1874                 // FAILED is a final state so only pass if we haven't failed
1875                 // Transition to PASSED
1876                 mHasPassedHealthCheck = true;
1877             }
1878             return updateHealthCheckStateLocked();
1879         }
1880 
1881         /** Returns the monitored package name. */
getName()1882         private String getName() {
1883             return mPackageName;
1884         }
1885 
1886         /**
1887          * Returns the current {@link HealthCheckState health check state}.
1888          */
1889         @GuardedBy("sLock")
1890         @HealthCheckState
getHealthCheckStateLocked()1891         public int getHealthCheckStateLocked() {
1892             return mHealthCheckState;
1893         }
1894 
1895         /**
1896          * Returns the shortest duration before the package should be scheduled for a prune.
1897          *
1898          * @return the duration or {@link Long#MAX_VALUE} if the package should not be scheduled
1899          */
1900         @GuardedBy("sLock")
getShortestScheduleDurationMsLocked()1901         public long getShortestScheduleDurationMsLocked() {
1902             // Consider health check duration only if #isPendingHealthChecksLocked is true
1903             return Math.min(toPositive(mDurationMs),
1904                     isPendingHealthChecksLocked()
1905                     ? toPositive(mHealthCheckDurationMs) : Long.MAX_VALUE);
1906         }
1907 
1908         /**
1909          * Returns {@code true} if the total duration left to monitor the package is less than or
1910          * equal to 0 {@code false} otherwise.
1911          */
1912         @GuardedBy("sLock")
isExpiredLocked()1913         public boolean isExpiredLocked() {
1914             return mDurationMs <= 0;
1915         }
1916 
1917         /**
1918          * Returns {@code true} if the package, {@link #getName} is expecting health check results
1919          * {@code false} otherwise.
1920          */
1921         @GuardedBy("sLock")
isPendingHealthChecksLocked()1922         public boolean isPendingHealthChecksLocked() {
1923             return mHealthCheckState == HealthCheckState.ACTIVE
1924                     || mHealthCheckState == HealthCheckState.INACTIVE;
1925         }
1926 
1927         /**
1928          * Updates the health check state based on {@link #mHasPassedHealthCheck}
1929          * and {@link #mHealthCheckDurationMs}.
1930          *
1931          * @return the new {@link HealthCheckState health check state}
1932          */
1933         @GuardedBy("sLock")
1934         @HealthCheckState
updateHealthCheckStateLocked()1935         private int updateHealthCheckStateLocked() {
1936             int oldState = mHealthCheckState;
1937             if (mHasPassedHealthCheck) {
1938                 // Set final state first to avoid ambiguity
1939                 mHealthCheckState = HealthCheckState.PASSED;
1940             } else if (mHealthCheckDurationMs <= 0 || mDurationMs <= 0) {
1941                 // Set final state first to avoid ambiguity
1942                 mHealthCheckState = HealthCheckState.FAILED;
1943             } else if (mHealthCheckDurationMs == Long.MAX_VALUE) {
1944                 mHealthCheckState = HealthCheckState.INACTIVE;
1945             } else {
1946                 mHealthCheckState = HealthCheckState.ACTIVE;
1947             }
1948 
1949             if (oldState != mHealthCheckState) {
1950                 Slog.i(TAG, "Updated health check state for package " + getName() + ": "
1951                         + toString(oldState) + " -> " + toString(mHealthCheckState));
1952             }
1953             return mHealthCheckState;
1954         }
1955 
1956         /** Returns a {@link String} representation of the current health check state. */
toString(@ealthCheckState int state)1957         private String toString(@HealthCheckState int state) {
1958             switch (state) {
1959                 case HealthCheckState.ACTIVE:
1960                     return "ACTIVE";
1961                 case HealthCheckState.INACTIVE:
1962                     return "INACTIVE";
1963                 case HealthCheckState.PASSED:
1964                     return "PASSED";
1965                 case HealthCheckState.FAILED:
1966                     return "FAILED";
1967                 default:
1968                     return "UNKNOWN";
1969             }
1970         }
1971 
1972         /** Returns {@code value} if it is greater than 0 or {@link Long#MAX_VALUE} otherwise. */
toPositive(long value)1973         private long toPositive(long value) {
1974             return value > 0 ? value : Long.MAX_VALUE;
1975         }
1976 
1977         /** Compares the equality of this object with another {@link MonitoredPackage}. */
1978         @VisibleForTesting
isEqualTo(MonitoredPackage pkg)1979         boolean isEqualTo(MonitoredPackage pkg) {
1980             return (getName().equals(pkg.getName()))
1981                     && mDurationMs == pkg.mDurationMs
1982                     && mHasPassedHealthCheck == pkg.mHasPassedHealthCheck
1983                     && mHealthCheckDurationMs == pkg.mHealthCheckDurationMs
1984                     && (mMitigationCalls.toString()).equals(pkg.mMitigationCalls.toString());
1985         }
1986     }
1987 
1988     @GuardedBy("sLock")
1989     @SuppressWarnings("GuardedBy")
saveAllObserversBootMitigationCountToMetadata(String filePath)1990     void saveAllObserversBootMitigationCountToMetadata(String filePath) {
1991         HashMap<String, Integer> bootMitigationCounts = new HashMap<>();
1992         for (int i = 0; i < mAllObservers.size(); i++) {
1993             final ObserverInternal observer = mAllObservers.valueAt(i);
1994             bootMitigationCounts.put(observer.name, observer.getBootMitigationCount());
1995         }
1996 
1997         FileOutputStream fileStream = null;
1998         ObjectOutputStream objectStream = null;
1999         try {
2000             fileStream = new FileOutputStream(new File(filePath));
2001             objectStream = new ObjectOutputStream(fileStream);
2002             objectStream.writeObject(bootMitigationCounts);
2003             objectStream.flush();
2004         } catch (Exception e) {
2005             Slog.i(TAG, "Could not save observers metadata to file: " + e);
2006             return;
2007         } finally {
2008             IoUtils.closeQuietly(objectStream);
2009             IoUtils.closeQuietly(fileStream);
2010         }
2011     }
2012 
2013     /**
2014      * Handles the thresholding logic for system server boots.
2015      */
2016     class BootThreshold {
2017 
2018         private final int mBootTriggerCount;
2019         private final long mTriggerWindow;
2020 
BootThreshold(int bootTriggerCount, long triggerWindow)2021         BootThreshold(int bootTriggerCount, long triggerWindow) {
2022             this.mBootTriggerCount = bootTriggerCount;
2023             this.mTriggerWindow = triggerWindow;
2024         }
2025 
reset()2026         public void reset() {
2027             setStart(0);
2028             setCount(0);
2029         }
2030 
getCount()2031         protected int getCount() {
2032             return CrashRecoveryProperties.rescueBootCount().orElse(0);
2033         }
2034 
setCount(int count)2035         protected void setCount(int count) {
2036             CrashRecoveryProperties.rescueBootCount(count);
2037         }
2038 
getStart()2039         public long getStart() {
2040             return CrashRecoveryProperties.rescueBootStart().orElse(0L);
2041         }
2042 
getMitigationCount()2043         public int getMitigationCount() {
2044             return CrashRecoveryProperties.bootMitigationCount().orElse(0);
2045         }
2046 
setStart(long start)2047         public void setStart(long start) {
2048             CrashRecoveryProperties.rescueBootStart(getStartTime(start));
2049         }
2050 
setMitigationStart(long start)2051         public void setMitigationStart(long start) {
2052             CrashRecoveryProperties.bootMitigationStart(getStartTime(start));
2053         }
2054 
getMitigationStart()2055         public long getMitigationStart() {
2056             return CrashRecoveryProperties.bootMitigationStart().orElse(0L);
2057         }
2058 
setMitigationCount(int count)2059         public void setMitigationCount(int count) {
2060             CrashRecoveryProperties.bootMitigationCount(count);
2061         }
2062 
constrain(long amount, long low, long high)2063         private static long constrain(long amount, long low, long high) {
2064             return amount < low ? low : (amount > high ? high : amount);
2065         }
2066 
getStartTime(long start)2067         public long getStartTime(long start) {
2068             final long now = mSystemClock.uptimeMillis();
2069             return constrain(start, 0, now);
2070         }
2071 
saveMitigationCountToMetadata()2072         public void saveMitigationCountToMetadata() {
2073             try (BufferedWriter writer = new BufferedWriter(new FileWriter(METADATA_FILE))) {
2074                 writer.write(String.valueOf(getMitigationCount()));
2075             } catch (Exception e) {
2076                 Slog.e(TAG, "Could not save metadata to file: " + e);
2077             }
2078         }
2079 
readMitigationCountFromMetadataIfNecessary()2080         public void readMitigationCountFromMetadataIfNecessary() {
2081             File bootPropsFile = new File(METADATA_FILE);
2082             if (bootPropsFile.exists()) {
2083                 try (BufferedReader reader = new BufferedReader(new FileReader(METADATA_FILE))) {
2084                     String mitigationCount = reader.readLine();
2085                     setMitigationCount(Integer.parseInt(mitigationCount));
2086                     bootPropsFile.delete();
2087                 } catch (Exception e) {
2088                     Slog.i(TAG, "Could not read metadata file: " + e);
2089                 }
2090             }
2091         }
2092 
2093 
2094         /** Increments the boot counter, and returns whether the device is bootlooping. */
2095         @GuardedBy("sLock")
incrementAndTest()2096         public boolean incrementAndTest() {
2097             readAllObserversBootMitigationCountIfNecessary(METADATA_FILE);
2098 
2099             final long now = mSystemClock.uptimeMillis();
2100             if (now - getStart() < 0) {
2101                 Slog.e(TAG, "Window was less than zero. Resetting start to current time.");
2102                 setStart(now);
2103                 setMitigationStart(now);
2104             }
2105             if (now - getMitigationStart() > DEFAULT_DEESCALATION_WINDOW_MS) {
2106                 setMitigationStart(now);
2107                 resetAllObserversBootMitigationCount();
2108             }
2109             final long window = now - getStart();
2110             if (window >= mTriggerWindow) {
2111                 setCount(1);
2112                 setStart(now);
2113                 return false;
2114             } else {
2115                 int count = getCount() + 1;
2116                 setCount(count);
2117                 EventLog.writeEvent(LOG_TAG_RESCUE_NOTE, Process.ROOT_UID, count, window);
2118                 // After a reboot (e.g. by WARM_REBOOT or mainline rollback) we apply
2119                 // mitigations without waiting for DEFAULT_BOOT_LOOP_TRIGGER_COUNT.
2120                 return (count >= mBootTriggerCount)
2121                         || (performedMitigationsDuringWindow() && count > 1);
2122             }
2123         }
2124 
2125         @GuardedBy("sLock")
performedMitigationsDuringWindow()2126         private boolean performedMitigationsDuringWindow() {
2127             for (ObserverInternal observerInternal: mAllObservers.values()) {
2128                 if (observerInternal.getBootMitigationCount() > 0) {
2129                     return true;
2130                 }
2131             }
2132             return false;
2133         }
2134 
2135         @GuardedBy("sLock")
resetAllObserversBootMitigationCount()2136         private void resetAllObserversBootMitigationCount() {
2137             for (int i = 0; i < mAllObservers.size(); i++) {
2138                 final ObserverInternal observer = mAllObservers.valueAt(i);
2139                 observer.setBootMitigationCount(0);
2140             }
2141             saveAllObserversBootMitigationCountToMetadata(METADATA_FILE);
2142         }
2143 
2144         @GuardedBy("sLock")
2145         @SuppressWarnings("GuardedBy")
readAllObserversBootMitigationCountIfNecessary(String filePath)2146         void readAllObserversBootMitigationCountIfNecessary(String filePath) {
2147             File metadataFile = new File(filePath);
2148             if (metadataFile.exists()) {
2149                 FileInputStream fileStream = null;
2150                 ObjectInputStream objectStream = null;
2151                 HashMap<String, Integer> bootMitigationCounts = null;
2152                 try {
2153                     fileStream = new FileInputStream(metadataFile);
2154                     objectStream = new ObjectInputStream(fileStream);
2155                     bootMitigationCounts =
2156                             (HashMap<String, Integer>) objectStream.readObject();
2157                 } catch (Exception e) {
2158                     Slog.i(TAG, "Could not read observer metadata file: " + e);
2159                    return;
2160                 } finally {
2161                     IoUtils.closeQuietly(objectStream);
2162                     IoUtils.closeQuietly(fileStream);
2163                 }
2164 
2165                 if (bootMitigationCounts == null || bootMitigationCounts.isEmpty()) {
2166                     Slog.i(TAG, "No observer in metadata file");
2167                     return;
2168                 }
2169                 for (int i = 0; i < mAllObservers.size(); i++) {
2170                     final ObserverInternal observer = mAllObservers.valueAt(i);
2171                     if (bootMitigationCounts.containsKey(observer.name)) {
2172                         observer.setBootMitigationCount(
2173                                 bootMitigationCounts.get(observer.name));
2174                     }
2175                 }
2176             }
2177         }
2178     }
2179 
2180     /**
2181      * Register broadcast receiver for shutdown.
2182      * We would save the observer state to persist across boots.
2183      *
2184      * @hide
2185      */
registerShutdownBroadcastReceiver()2186     public void registerShutdownBroadcastReceiver() {
2187         BroadcastReceiver shutdownEventReceiver = new BroadcastReceiver() {
2188             @Override
2189             public void onReceive(Context context, Intent intent) {
2190                 // Only write if intent is relevant to device reboot or shutdown.
2191                 String intentAction = intent.getAction();
2192                 if (ACTION_REBOOT.equals(intentAction)
2193                         || ACTION_SHUTDOWN.equals(intentAction)) {
2194                     writeNow();
2195                 }
2196             }
2197         };
2198 
2199         // Setup receiver for device reboots or shutdowns.
2200         IntentFilter filter = new IntentFilter(ACTION_REBOOT);
2201         filter.addAction(ACTION_SHUTDOWN);
2202         mContext.registerReceiverForAllUsers(shutdownEventReceiver, filter, null,
2203                 /* run on main thread */ null);
2204     }
2205 }
2206