from __future__ import annotations

import re
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Optional, Sequence

if TYPE_CHECKING:
    from lava.utils import LogFollower

from lava.exceptions import MesaCIKnownIssueException
from lava.utils.console_format import CONSOLE_LOG
from lava.utils.constants import (
    A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT,
    A6XX_GPU_RECOVERY_FAILURE_MESSAGE,
    A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN,
    KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER,
    KNOWN_ISSUE_R8152_PATTERNS,
    LOG_DEBUG_FEEDBACK_NOISE,
)
from lava.utils.log_section import LogSectionType


def search_known_issue_patterns(patterns: Sequence[str], line: str) -> str:
    """Return the first pattern that matches the log line, or "" if none match."""
    for pattern in patterns:
        if re.search(pattern, line):
            return pattern
    return ""


@dataclass
class LAVALogHints:
    log_follower: LogFollower
    r8152_issue_consecutive_counter: int = field(default=0, init=False)
    reboot_counter: int = field(default=0, init=False)
    a6xx_gpu_recovery_fail_counter: int = field(default=0, init=False)
    a6xx_gpu_first_fail_time: Optional[datetime] = field(default=None, init=False)

    def raise_known_issue(self, message: str) -> None:
        raise MesaCIKnownIssueException(
            "Found known issue: "
            f"{CONSOLE_LOG['FG_MAGENTA']}"
            f"{message}"
            f"{CONSOLE_LOG['RESET']}"
        )

    def detect_failure(self, new_lines: list[dict[str, Any]]) -> None:
        for line in new_lines:
            if line["msg"] == LOG_DEBUG_FEEDBACK_NOISE:
                continue
            self.detect_r8152_issue(line)
            self.detect_forced_reboot(line)
            self.detect_a6xx_gpu_recovery_failure(line)

    def detect_r8152_issue(self, line: dict[str, Any]) -> None:
        if self.log_follower.phase in (
            LogSectionType.LAVA_BOOT,
            LogSectionType.TEST_CASE,
        ) and line["lvl"] in ("feedback", "target"):
            if search_known_issue_patterns(KNOWN_ISSUE_R8152_PATTERNS, line["msg"]):
                if (
                    self.r8152_issue_consecutive_counter
                    < KNOWN_ISSUE_R8152_MAX_CONSECUTIVE_COUNTER
                ):
                    self.r8152_issue_consecutive_counter += 1
                    return

                self.raise_known_issue(
                    "Probable network issue failure encountered, retrying the job"
                )

        # Reset the counter, as the `nfs... still trying` complaint was not detected
        self.r8152_issue_consecutive_counter = 0

    def detect_forced_reboot(self, line: dict[str, Any]) -> None:
        if (
            self.log_follower.phase == LogSectionType.TEST_CASE
            and line["lvl"] == "feedback"
        ):
            if re.search(r"^Reboot requested", line["msg"]):
                self.reboot_counter += 1

                if self.reboot_counter > 0:
                    self.raise_known_issue(
                        "Forced reboot detected during test phase, failing the job..."
                    )

    # If the a6xx GPU repeatedly fails to recover over a short period of time,
    # then successful recovery is unlikely, so cancel the job preemptively.
    def detect_a6xx_gpu_recovery_failure(self, line: dict[str, Any]) -> None:
        if search_known_issue_patterns(A6XX_GPU_RECOVERY_FAILURE_MESSAGE, line["msg"]):
            time_of_failure = datetime.fromisoformat(line["dt"])
            self.a6xx_gpu_recovery_fail_counter += 1

            if self.a6xx_gpu_first_fail_time is None:
                self.a6xx_gpu_first_fail_time = time_of_failure

            if self.a6xx_gpu_recovery_fail_counter == A6XX_GPU_RECOVERY_FAILURE_MAX_COUNT:
                time_since_first_fail = time_of_failure - self.a6xx_gpu_first_fail_time
                if time_since_first_fail <= timedelta(
                    minutes=A6XX_GPU_RECOVERY_WATCH_PERIOD_MIN
                ):
                    self.raise_known_issue(
                        "Repeated GPU recovery failure detected: cancelling the job"
                    )
                else:
                    self.a6xx_gpu_first_fail_time = None
                    self.a6xx_gpu_recovery_fail_counter = 0
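

# Illustrative usage sketch (not part of the upstream module): shows how the
# detectors above are typically driven with decoded LAVA log dictionaries.
# The stub follower below is an assumption standing in for lava.utils.LogFollower;
# only the `phase` attribute read by the detectors is mimicked.
if __name__ == "__main__":
    from types import SimpleNamespace

    stub_follower = SimpleNamespace(phase=LogSectionType.TEST_CASE)
    hints = LAVALogHints(log_follower=stub_follower)  # type: ignore[arg-type]

    # One decoded log line; a forced reboot during the test phase is a known
    # issue, so detect_failure() raises MesaCIKnownIssueException.
    sample_lines = [
        {"lvl": "feedback", "msg": "Reboot requested", "dt": datetime.now().isoformat()},
    ]
    try:
        hints.detect_failure(sample_lines)
    except MesaCIKnownIssueException as exc:
        print(exc)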