from __future__ import annotations

import re
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional, Sequence

if TYPE_CHECKING:
    from lava.utils import LogFollower

from lava.exceptions import MesaCIKnownIssueException
from lava.utils.console_format import CONSOLE_LOG
from lava.utils.constants import (
    KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
    LOG_DEBUG_FEEDBACK_NOISE,
    KNOWN_ISSUE_R8152_PATTERNS,
    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
)
from lava.utils.log_section import LogSectionType


def search_known_issue_patterns(patterns: Sequence[str], line: str) -> str:
    # Return the first pattern that matches the line, or "" when none match.
    for pattern in patterns:
        if re.search(pattern, line):
            return pattern
    return ""


@dataclass
class LAVALogHints:
    log_follower: LogFollower
    r8152_issue_consecutive_counter: int = field(default=0, init=False)
    reboot_counter: int = field(default=0, init=False)
    a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False)
    a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False)

    def raise_known_issue(self, message: str) -> None:
        raise MesaCIKnownIssueException(
            "Found known issue: "
            f"{CONSOLE_LOG['FG_MAGENTA']}"
            f"{message}"
            f"{CONSOLE_LOG['RESET']}"
        )

    def detect_failure(self, new_lines: list[dict[str, Any]]) -> None:
        # Run every known-issue detector over the newly received log lines,
        # skipping the debug feedback noise line.
        for line in new_lines:
            if line["msg"] == LOG_DEBUG_FEEDBACK_NOISE:
                continue
            self.detect_r8152_issue(line)
            self.detect_forced_reboot(line)
            self.detect_a6xx_gpu_recovery_failure(line)

    def detect_r8152_issue(self, line: dict[str, Any]) -> None:
        # The r8152 network issue is only relevant during the boot and test
        # phases; raise once the matching lines exceed the consecutive limit.
        if self.log_follower.phase in (
            LogSectionType.LAVA_BOOT,
            LogSectionType.TEST_CASE,
        ) and line["lvl"] in ("feedback", "target"):
            if search_known_issue_patterns(KNOWN_ISSUE_R8152_PATTERNS, line["msg"]):
                if (
                    self.r8152_issue_consecutive_counter
                    < KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER
                ):
                    self.r8152_issue_consecutive_counter += 1
                    return

                self.raise_known_issue(
                    "Probable network issue failure encountered, retrying the job"
                )

        # Reset the status, as the `nfs... still trying` complaint was not detected
        self.r8152_issue_consecutive_counter = 0

    def detect_forced_reboot(self, line: dict[str, Any]) -> None:
        # A reboot requested in the middle of the test phase compromises the
        # run, so fail the job right away.
        if (
            self.log_follower.phase == LogSectionType.TEST_CASE
            and line["lvl"] == "feedback"
        ):
            if re.search(r"^Reboot requested", line["msg"]):
                self.reboot_counter += 1

                if self.reboot_counter > 0:
                    self.raise_known_issue(
                        "Forced reboot detected during test phase, failing the job..."
                    )

    # If the a6xx gpu repeatedly fails to recover over a short period of time,
    # then successful recovery is unlikely so cancel the job preemptively.
    def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None:
        if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]):
            time_of_failure = datetime.fromisoformat(line["dt"])
            self.a6xx_gpu_recovery_fail_counter += 1

            if self.a6xx_gpu_first_fail_time is None:
                self.a6xx_gpu_first_fail_time = time_of_failure

            if (
                self.a6xx_gpu_recovery_fail_counter
                == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT
            ):
                time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time
                if time_since_first_fail <= timedelta(
                    minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
                ):
                    self.raise_known_issue(
                        "Repeated GPU recovery failure detected: cancelling the job"
                    )
                else:
                    # The failures were spread over a longer window; start
                    # counting again from this point.
                    self.a6xx_gpu_first_fail_time = None
                    self.a6xx_gpu_recovery_fail_counter = 0
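

# Usage sketch: `follower` stands in for an already-configured LogFollower and
# `parsed_lines` for the structured log entries (dicts with "lvl", "msg" and
# "dt" keys, as consumed above) produced by the log parsing layer; both names
# are hypothetical placeholders chosen for illustration only.
#
#     hints = LAVALogHints(log_follower=follower)
#     try:
#         hints.detect_failure(parsed_lines)
#     except MesaCIKnownIssueException as exc:
#         # A known flake was detected; the caller decides whether to retry
#         # or cancel the LAVA job based on the exception message.
#         print(exc)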