1 /* 2 * Copyright (C) 2019 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.android.tradefed.retry; 17 18 import com.android.tradefed.config.IConfiguration; 19 import com.android.tradefed.config.IConfigurationReceiver; 20 import com.android.tradefed.config.Option; 21 import com.android.tradefed.device.DeviceNotAvailableException; 22 import com.android.tradefed.device.ITestDevice; 23 import com.android.tradefed.device.StubDevice; 24 import com.android.tradefed.device.internal.DeviceResetHandler; 25 import com.android.tradefed.device.internal.DeviceSnapshotHandler; 26 import com.android.tradefed.invoker.IInvocationContext; 27 import com.android.tradefed.invoker.InvocationContext; 28 import com.android.tradefed.invoker.TestInformation; 29 import com.android.tradefed.invoker.logger.CurrentInvocation; 30 import com.android.tradefed.invoker.logger.CurrentInvocation.IsolationGrade; 31 import com.android.tradefed.invoker.logger.InvocationMetricLogger; 32 import com.android.tradefed.invoker.logger.InvocationMetricLogger.InvocationMetricKey; 33 import com.android.tradefed.invoker.tracing.CloseableTraceScope; 34 import com.android.tradefed.log.LogUtil.CLog; 35 import com.android.tradefed.result.TestRunResult; 36 import com.android.tradefed.result.error.DeviceErrorIdentifier; 37 import com.android.tradefed.testtype.IRemoteTest; 38 import com.android.tradefed.testtype.ITestFileFilterReceiver; 39 import com.android.tradefed.testtype.ITestFilterReceiver; 40 import com.android.tradefed.testtype.ITestInformationReceiver; 41 import com.android.tradefed.testtype.SubprocessTfLauncher; 42 import com.android.tradefed.testtype.retry.IAutoRetriableTest; 43 import com.android.tradefed.testtype.suite.ModuleDefinition; 44 import com.android.tradefed.testtype.suite.SuiteTestFilter; 45 46 import com.google.common.annotations.VisibleForTesting; 47 48 import java.util.ArrayList; 49 import java.util.LinkedHashSet; 50 import java.util.List; 51 import java.util.Set; 52 import java.util.stream.Collectors; 53 54 /** 55 * Base implementation of {@link IRetryDecision}. Base implementation only take local signals into 56 * account. 57 */ 58 public class BaseRetryDecision 59 implements IRetryDecision, IConfigurationReceiver, ITestInformationReceiver { 60 61 @Option( 62 name = "reboot-at-last-retry", 63 description = "Reboot the device at the last retry attempt." 64 ) 65 private boolean mRebootAtLastRetry = false; 66 67 @Option( 68 name = "retry-isolation-grade", 69 description = "Control the isolation level that should be attempted between retries." 70 ) 71 private IsolationGrade mRetryIsolationGrade = IsolationGrade.NOT_ISOLATED; 72 73 @Option( 74 name = "max-testrun-run-count", 75 description = 76 "If the IRemoteTest can have its modules run multiple times, " 77 + "the max number of runs for each test run (module). " 78 + "This is different from max-testcase-run-count which " 79 + "is for each test case. For example, if the testcase " 80 + "run count is 1 and the testrun run count is 3, we " 81 + "will run the module up to 3 times so as to execute " 82 + "each test case once. Format is " 83 + "[<module id>:]<run count> . If module is " 84 + "unspecified, it applies to all modules. Default is " 85 + "to use the value of max-testcase-run-count.") 86 private Set<String> mTestRunAttempts = new LinkedHashSet<>(); 87 88 @Option( 89 name = "max-testcase-run-count", 90 description = 91 "If the IRemoteTest can have its testcases run multiple times, " 92 + "the max number of runs for each testcase. Format is " 93 + "[<module id>:]<run count> . If module is " 94 + "unspecified, it applies to all modules. " 95 + "Default is 1 attempt.") 96 private Set<String> mTestCaseAttempts = new LinkedHashSet<>(); 97 98 @Option( 99 name = "retry-strategy", 100 description = 101 "The retry strategy to be used when re-running some tests with " 102 + "--max-testcase-run-count" 103 ) 104 private RetryStrategy mRetryStrategy = RetryStrategy.NO_RETRY; 105 106 @Option( 107 name = "skip-retry-in-presubmit", 108 description = "Skip retry attempts specifically in presubmit builds") 109 private boolean mSkipRetryInPresubmit = false; 110 111 @Option( 112 name = "auto-retry", 113 description = 114 "Whether or not to enable the new auto-retry. This is a feature flag for testing." 115 ) 116 private boolean mEnableAutoRetry = true; 117 118 @Option( 119 name = "skip-retrying-list", 120 description = 121 "If a test in the list, skip retrying it. The format is the same as the " 122 + "SuiteTestFilter.") 123 private Set<String> mSkipRetryingSet = new LinkedHashSet<>(); 124 125 @Deprecated 126 @Option( 127 name = "updated-retry-reporting", 128 description = "Feature flag to use the updated retry reporting strategy.") 129 private boolean mUpdatedReporting = true; 130 131 @Deprecated 132 @SuppressWarnings("unused") 133 @Option( 134 name = "updated-filtering", 135 description = "Feature flag to use the updated filtering logic.") 136 private boolean mUpdatedFiltering = true; 137 138 @Deprecated 139 @SuppressWarnings("unused") 140 @Option( 141 name = "module-preparation-retry", 142 description = "Whether or not to retry any module-level target preparation errors." + 143 "This flag is for feature testing, and eventualy it's all controlled under " + 144 "retry strategy." 145 ) 146 private boolean mModulePreparationRetry = false; 147 148 @Option( 149 name = "use-snapshot-for-reset", 150 description = "Feature flag to use snapshot/restore instead of powerwash.") 151 private boolean mUseSnapshotForReset = false; 152 153 private IInvocationContext mContext; 154 private IConfiguration mConfiguration; 155 private TestInformation mTestInformation; 156 157 private IRemoteTest mCurrentlyConsideredTest; 158 private RetryStatsHelper mStatistics; 159 private RetryTracker mRetryTracker; 160 private ExcludeFilterManager mExcludeManager; 161 private RetryCountParser mRetryCountParser; 162 163 /** Constructor for the retry decision */ BaseRetryDecision()164 public BaseRetryDecision() {} 165 166 @Override isAutoRetryEnabled()167 public boolean isAutoRetryEnabled() { 168 return mEnableAutoRetry; 169 } 170 171 @Override getRetryStrategy()172 public RetryStrategy getRetryStrategy() { 173 return mRetryStrategy; 174 } 175 176 @Override rebootAtLastAttempt()177 public boolean rebootAtLastAttempt() { 178 return mRebootAtLastRetry; 179 } 180 getRetryCountParser()181 private RetryCountParser getRetryCountParser() { 182 if (mRetryCountParser == null) { 183 mRetryCountParser = new RetryCountParser(mTestCaseAttempts, mTestRunAttempts); 184 } 185 return mRetryCountParser; 186 } 187 188 @Override getMaxTestRunAttempts()189 public int getMaxTestRunAttempts() { 190 return getMaxTestRunAttempts(null); 191 } 192 193 @Override getMaxTestRunAttempts(ModuleDefinition module)194 public int getMaxTestRunAttempts(ModuleDefinition module) { 195 return getRetryCountParser().getMaxTestRunAttempts(module); 196 } 197 198 @Override getMaxTestCaseAttempts()199 public int getMaxTestCaseAttempts() { 200 return getRetryCountParser().getMaxTestCaseAttempts(null); 201 } 202 203 @Override getMaxTestCaseAttempts(ModuleDefinition module)204 public int getMaxTestCaseAttempts(ModuleDefinition module) { 205 return getRetryCountParser().getMaxTestCaseAttempts(module); 206 } 207 208 @Override getCommandLineArgs()209 public List<String> getCommandLineArgs() { 210 List<String> args = new ArrayList<>(); 211 args.addAll(getRetryCountParser().getCommandLineArgs()); 212 args.addAll(List.of("--retry-strategy", mRetryStrategy.toString())); 213 if (mRebootAtLastRetry) { 214 args.add("--reboot-at-last-retry"); 215 } 216 args.addAll(List.of("--retry-isolation-grade", mRetryIsolationGrade.toString())); 217 for (String filterEntry : mSkipRetryingSet) { 218 args.add("--skip-retrying-list"); 219 args.add(filterEntry); 220 } 221 if (mSkipRetryInPresubmit) { 222 args.add("--skip-retry-in-presubmit"); 223 } 224 return args; 225 } 226 227 @Override addToSkipRetryList(String filterEntry)228 public void addToSkipRetryList(String filterEntry) { 229 mSkipRetryingSet.add(filterEntry); 230 } 231 232 @Override shouldRetryPreparation( ModuleDefinition module, int attempt, int maxAttempt)233 public RetryPreparationDecision shouldRetryPreparation( 234 ModuleDefinition module, 235 int attempt, 236 int maxAttempt) { 237 RetryPreparationDecision decision = new RetryPreparationDecision(false, true); 238 switch (mRetryStrategy) { 239 case NO_RETRY: 240 // Currently, do not retry if RetryStrategy is NO_RETRY. 241 return decision; 242 default: 243 // Continue the logic for retry the failures. 244 break; 245 } 246 if (attempt == maxAttempt) { 247 // No need to retry if it reaches the maximum retry count. 248 return decision; 249 } 250 if (mSkipRetryInPresubmit && InvocationContext.isPresubmit(mContext)) { 251 CLog.d("Skipping retry due to --skip-retry-in-presubmit"); 252 return decision; 253 } 254 255 // Resetting the device only happends when FULLY_ISOLATED is set, and that cleans up the 256 // device to pure state and re-run suite-level or module-level setup. Besides, it doesn't 257 // need to retry module for reboot isolation. 258 if (!IsolationGrade.FULLY_ISOLATED.equals(mRetryIsolationGrade)) { 259 CLog.i("Do not proceed on module retry because it's not set FULLY_ISOLATED."); 260 return decision; 261 } 262 263 try { 264 recoverStateOfDevices(getDevices(), attempt, module); 265 } catch (DeviceNotAvailableException e) { 266 // Retried failed, set the exception and return the decision. 267 decision = new RetryPreparationDecision(true, false); 268 decision.setPreviousException(e.getCause()); 269 return decision; 270 } 271 // Retried successfully, no exception will be caught, return the decision. 272 decision = new RetryPreparationDecision(false, false); 273 decision.setPreviousException(null); 274 return decision; 275 } 276 277 @Override setInvocationContext(IInvocationContext context)278 public void setInvocationContext(IInvocationContext context) { 279 mContext = context; 280 } 281 282 @Override setConfiguration(IConfiguration configuration)283 public void setConfiguration(IConfiguration configuration) { 284 mConfiguration = configuration; 285 } 286 287 @Override setTestInformation(TestInformation testInformation)288 public void setTestInformation(TestInformation testInformation) { 289 mTestInformation = testInformation; 290 } 291 292 @Override getTestInformation()293 public TestInformation getTestInformation() { 294 return mTestInformation; 295 } 296 297 @Override shouldRetry( IRemoteTest test, int attemptJustExecuted, List<TestRunResult> previousResults)298 public boolean shouldRetry( 299 IRemoteTest test, int attemptJustExecuted, List<TestRunResult> previousResults) 300 throws DeviceNotAvailableException { 301 return shouldRetry(test, null, attemptJustExecuted, previousResults, null); 302 } 303 304 @Override shouldRetry( IRemoteTest test, ModuleDefinition module, int attemptJustExecuted, List<TestRunResult> previousResults, DeviceNotAvailableException dnae)305 public boolean shouldRetry( 306 IRemoteTest test, 307 ModuleDefinition module, 308 int attemptJustExecuted, 309 List<TestRunResult> previousResults, 310 DeviceNotAvailableException dnae) 311 throws DeviceNotAvailableException { 312 // Keep track of some results for the test in progress for statistics purpose. 313 if (test != mCurrentlyConsideredTest) { 314 mCurrentlyConsideredTest = test; 315 mStatistics = new RetryStatsHelper(); 316 mRetryTracker = new RetryTracker(getMaxTestCaseAttempts(module)); 317 mExcludeManager = new ExcludeFilterManager(test); 318 } 319 320 if (mSkipRetryInPresubmit && InvocationContext.isPresubmit(mContext)) { 321 CLog.d("Skipping retry due to --skip-retry-in-presubmit"); 322 return false; 323 } 324 325 boolean isAlreadyRecovered = false; 326 if (dnae != null) { 327 if (!module.shouldRecoverVirtualDevice()) { 328 throw dnae; 329 } 330 recoverStateOfDevices(getDevices(), attemptJustExecuted, module); 331 isAlreadyRecovered = true; 332 // Add metrics towards device is recovered by device reset. 333 if (IsolationGrade.FULLY_ISOLATED.equals(mRetryIsolationGrade)) { 334 InvocationMetricLogger.addInvocationMetrics( 335 InvocationMetricLogger.InvocationMetricKey 336 .DEVICE_RECOVERED_FROM_DEVICE_RESET, 337 1); 338 } 339 } 340 341 // Return early for strategies other than RETRY_ANY_FAILURE. 342 switch (mRetryStrategy) { 343 case NO_RETRY: 344 // Return directly if we are not considering retry at all. 345 return false; 346 case ITERATIONS: 347 // Still support isolating the iterations if that's configured 348 if (!isAlreadyRecovered) { 349 recoverStateOfDevices(getDevices(), attemptJustExecuted, module); 350 } 351 // For iterations, retry directly, we have nothing to setup 352 return true; 353 case RERUN_UNTIL_FAILURE: 354 // For retrying until failure, if any failures occurred, skip retry. 355 return !hasAnyFailures(previousResults); 356 default: 357 // Continue the logic for retry the failures. 358 break; 359 } 360 361 if (!hasAnyFailures(previousResults)) { 362 CLog.d("No test run or test case failures. No need to retry."); 363 mStatistics.addResultsFromRun(previousResults, 0L, attemptJustExecuted); 364 return false; 365 } 366 367 Set<String> moduleSkipList = new LinkedHashSet<String>(); 368 if (module != null && isInSkipList(module, moduleSkipList)) { 369 CLog.d("Skip retrying known failure test of %s", module.getId()); 370 InvocationMetricLogger.addInvocationMetrics( 371 InvocationMetricKey.RETRY_SKIPPED_ALL_FILTERED_COUNT, 1); 372 return false; 373 } 374 if (module == null) { 375 // If it's not a module, carry all filters 376 moduleSkipList.addAll(mSkipRetryingSet); 377 } 378 379 boolean shouldRetry = false; 380 long retryStartTime = System.currentTimeMillis(); 381 if (test instanceof ITestFilterReceiver || test instanceof ITestFileFilterReceiver) { 382 // Record the attempt for the previous failed tests. 383 mRetryTracker.recordTestRun(previousResults, attemptJustExecuted, moduleSkipList); 384 385 // Setup exclude filters. 386 mExcludeManager.resetDefaultFilters(); 387 mExcludeManager.addExcludeFilters(mRetryTracker.getExcludedTests()); 388 389 // Check if we should retry. 390 shouldRetry = mRetryTracker.shouldRetry(); 391 392 if (shouldRetry && !isAlreadyRecovered) { 393 // In case of retry, go through the recovery routine 394 recoverStateOfDevices(getDevices(), attemptJustExecuted, module); 395 } 396 } else if (test instanceof IAutoRetriableTest) { 397 // Routine for IRemoteTest that don't support filters but still needs retry. 398 IAutoRetriableTest autoRetryTest = (IAutoRetriableTest) test; 399 shouldRetry = 400 autoRetryTest.shouldRetry(attemptJustExecuted, previousResults, moduleSkipList); 401 if (shouldRetry && !isAlreadyRecovered) { 402 recoverStateOfDevices(getDevices(), attemptJustExecuted, module); 403 } 404 } else { 405 CLog.d( 406 "%s does not implement ITestFilterReceiver or ITestFileFilterReceiver or " 407 + "IAutoRetriableTest, thus cannot work with auto-retry.", 408 test); 409 return false; 410 } 411 long retryCost = System.currentTimeMillis() - retryStartTime; 412 if (!shouldRetry) { 413 retryCost = 0L; 414 } 415 mStatistics.addResultsFromRun(previousResults, retryCost, attemptJustExecuted); 416 return shouldRetry; 417 } 418 419 @Override addLastAttempt(List<TestRunResult> lastResults)420 public void addLastAttempt(List<TestRunResult> lastResults) { 421 mStatistics.addResultsFromRun(lastResults); 422 } 423 424 @Override getRetryStatistics()425 public RetryStatistics getRetryStatistics() { 426 if (mStatistics == null) { 427 return new RetryStatsHelper().calculateStatistics(); 428 } 429 return mStatistics.calculateStatistics(); 430 } 431 432 /** Returns true if we should use the updated reporting. */ 433 @Override useUpdatedReporting()434 public boolean useUpdatedReporting() { 435 return mUpdatedReporting; 436 } 437 438 @VisibleForTesting getIsolationGrade()439 public IsolationGrade getIsolationGrade() { 440 return mRetryIsolationGrade; 441 } 442 getSkipRetrySet()443 public Set<String> getSkipRetrySet() { 444 return mSkipRetryingSet; 445 } 446 447 /** 448 * Skips retry if the module is fully skipped and populate module skip list if only some tests 449 * need to stop retrying. 450 */ isInSkipList(ModuleDefinition module, Set<String> moduleSkipList)451 private boolean isInSkipList(ModuleDefinition module, Set<String> moduleSkipList) { 452 String moduleId = module.getId(); 453 if (moduleId == null) { 454 return false; 455 } 456 SuiteTestFilter moduleIdFilter = SuiteTestFilter.createFrom(moduleId); 457 String abi = moduleIdFilter.getAbi(); 458 String name = moduleIdFilter.getName(); 459 460 boolean shouldSkip = false; 461 for (String skipTest : mSkipRetryingSet) { 462 // Only handle module level exclusion 463 SuiteTestFilter skipRetryingFilter = SuiteTestFilter.createFrom(skipTest); 464 String skipAbi = skipRetryingFilter.getAbi(); 465 String skipName = skipRetryingFilter.getName(); 466 String skipTestName = skipRetryingFilter.getTest(); 467 if (abi != null 468 && name != null 469 && skipName != null 470 && name.equals(skipName)) { 471 if (skipAbi != null && !abi.equals(skipAbi)) { 472 // If the skip has an explicit abi that doesn't match 473 // module, don't skip. If not specified, consider all modules 474 continue; 475 } 476 if (skipTestName == null) { 477 InvocationMetricLogger.addInvocationMetrics( 478 InvocationMetricKey.RETRY_MODULE_SKIPPED_COUNT, 1); 479 shouldSkip = true; 480 } else { 481 moduleSkipList.add(skipTestName); 482 } 483 } 484 } 485 return shouldSkip; 486 } 487 488 /** Returns true if there are any failures in the previous results. */ hasAnyFailures(List<TestRunResult> previousResults)489 private boolean hasAnyFailures(List<TestRunResult> previousResults) { 490 for (TestRunResult run : previousResults) { 491 if (run != null && (run.isRunFailure() || run.hasFailedTests())) { 492 return true; 493 } 494 } 495 return false; 496 } 497 498 /** Returns all the non-stub device associated with the {@link IRemoteTest}. */ getDevices()499 private List<ITestDevice> getDevices() { 500 List<ITestDevice> listDevices = new ArrayList<>(mContext.getDevices()); 501 // Return all the non-stub device (the one we can actually do some recovery against) 502 return listDevices 503 .stream() 504 .filter(d -> !(d.getIDevice() instanceof StubDevice)) 505 .collect(Collectors.toList()); 506 } 507 508 /** Recovery attempt on the device to get it a better state before next retry. */ recoverStateOfDevices( List<ITestDevice> devices, int lastAttempt, ModuleDefinition module)509 private void recoverStateOfDevices( 510 List<ITestDevice> devices, int lastAttempt, ModuleDefinition module) 511 throws DeviceNotAvailableException { 512 if (IsolationGrade.REBOOT_ISOLATED.equals(mRetryIsolationGrade)) { 513 long start = System.currentTimeMillis(); 514 try (CloseableTraceScope ignored = new CloseableTraceScope("reboot_isolation")) { 515 for (ITestDevice device : devices) { 516 device.reboot(); 517 } 518 CurrentInvocation.setModuleIsolation(IsolationGrade.REBOOT_ISOLATED); 519 CurrentInvocation.setRunIsolation(IsolationGrade.REBOOT_ISOLATED); 520 } finally { 521 InvocationMetricLogger.addInvocationPairMetrics( 522 InvocationMetricKey.REBOOT_RETRY_ISOLATION_PAIR, 523 start, System.currentTimeMillis()); 524 } 525 } else if (IsolationGrade.FULLY_ISOLATED.equals(mRetryIsolationGrade)) { 526 resetIsolation(module, devices); 527 } else if (lastAttempt >= (getMaxTestCaseAttempts(module) - 2)) { 528 // Reset only works for suite right now 529 if (mRebootAtLastRetry) { 530 for (ITestDevice device : devices) { 531 device.reboot(); 532 } 533 CurrentInvocation.setModuleIsolation(IsolationGrade.REBOOT_ISOLATED); 534 CurrentInvocation.setRunIsolation(IsolationGrade.REBOOT_ISOLATED); 535 } 536 } 537 } 538 resetIsolation(ModuleDefinition module, List<ITestDevice> devices)539 private void resetIsolation(ModuleDefinition module, List<ITestDevice> devices) 540 throws DeviceNotAvailableException { 541 long start = System.currentTimeMillis(); 542 try (CloseableTraceScope ignored = new CloseableTraceScope("reset_isolation")) { 543 isolateRetry(devices); 544 CLog.d( 545 "Current host properties being erased by reset: %s", 546 mTestInformation.properties().getAll()); 547 mTestInformation.properties().clear(); 548 // Rerun suite level preparer if we are inside a subprocess 549 reSetupModule( 550 module, 551 (mConfiguration 552 .getCommandOptions() 553 .getInvocationData() 554 .containsKey(SubprocessTfLauncher.SUBPROCESS_TAG_NAME) 555 && !mUseSnapshotForReset)); 556 } finally { 557 InvocationMetricLogger.addInvocationPairMetrics( 558 InvocationMetricKey.RESET_RETRY_ISOLATION_PAIR, 559 start, System.currentTimeMillis()); 560 } 561 } 562 563 @VisibleForTesting isolateRetry(List<ITestDevice> devices)564 protected void isolateRetry(List<ITestDevice> devices) throws DeviceNotAvailableException { 565 if (!mUseSnapshotForReset) { 566 DeviceResetHandler handler = new DeviceResetHandler(mContext); 567 for (ITestDevice device : devices) { 568 boolean resetSuccess = handler.resetDevice(device); 569 if (!resetSuccess) { 570 throw new DeviceNotAvailableException( 571 String.format("Failed to reset device: %s", device.getSerialNumber()), 572 device.getSerialNumber(), 573 DeviceErrorIdentifier.DEVICE_FAILED_TO_RESET); 574 } 575 } 576 } else { 577 for (ITestDevice device : devices) { 578 new DeviceSnapshotHandler() 579 .restoreSnapshotDevice(device, mContext.getInvocationId()); 580 } 581 } 582 } 583 reSetupModule(ModuleDefinition module, boolean includeSuitePreparers)584 private void reSetupModule(ModuleDefinition module, boolean includeSuitePreparers) 585 throws DeviceNotAvailableException { 586 if (module == null) { 587 return; 588 } 589 if (module.getId() != null) { 590 InvocationMetricLogger.addInvocationMetrics( 591 InvocationMetricKey.DEVICE_RESET_MODULES, module.getId()); 592 } 593 // Run all preparers including optionally suite level ones. 594 Throwable preparationException = 595 module.runPreparation(includeSuitePreparers); 596 if (preparationException != null) { 597 CLog.e(preparationException); 598 throw new DeviceNotAvailableException( 599 String.format( 600 "Failed to reset devices before retry: %s", 601 preparationException.toString()), 602 preparationException, 603 "serial", 604 DeviceErrorIdentifier.DEVICE_FAILED_TO_RESET); 605 } 606 } 607 }