• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#
3# Copyright (C) 2022 Collabora Limited
4# Author: Guilherme Gallo <guilherme.gallo@collabora.com>
5#
6# SPDX-License-Identifier: MIT
7
8import xmlrpc.client
9from contextlib import nullcontext as does_not_raise
10from datetime import datetime
11from itertools import chain, repeat
12
13import pytest
14from lava.exceptions import MesaCIException, MesaCIRetryError
15from lava.lava_job_submitter import (
16    DEVICE_HANGING_TIMEOUT_SEC,
17    NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
18    LAVAJob,
19    follow_job_execution,
20    retriable_follow_job,
21)
22from lava.utils import LogSectionType
23
24from .lava.helpers import (
25    generate_n_logs,
26    generate_testsuite_result,
27    jobs_logs_response,
28    mock_lava_signal,
29    mock_logs,
30    section_timeout,
31)
32
33NUMBER_OF_MAX_ATTEMPTS = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1
34
35
36@pytest.fixture
37def mock_proxy_waiting_time(mock_proxy):
38    def update_mock_proxy(frozen_time, **kwargs):
39        wait_time = kwargs.pop("wait_time", 1)
40        proxy_mock = mock_proxy(**kwargs)
41        proxy_job_state = proxy_mock.scheduler.job_state
42        proxy_job_state.return_value = {"job_state": "Running"}
43        proxy_job_state.side_effect = frozen_time.tick(wait_time)
44
45        return proxy_mock
46
47    return update_mock_proxy
48
49
50@pytest.mark.parametrize("exception", [RuntimeError, SystemError, KeyError])
51def test_submit_and_follow_respects_exceptions(mock_sleep, mock_proxy, exception):
52    with pytest.raises(MesaCIException):
53        proxy = mock_proxy(side_effect=exception)
54        job = LAVAJob(proxy, '')
55        follow_job_execution(job)
56
57
58NETWORK_EXCEPTION = xmlrpc.client.ProtocolError("", 0, "test", {})
59XMLRPC_FAULT = xmlrpc.client.Fault(0, "test")
60
61PROXY_SCENARIOS = {
62    "simple pass case": (mock_logs(result="pass"), does_not_raise(), "pass", {}),
63    "simple fail case": (mock_logs(result="fail"), does_not_raise(), "fail", {}),
64    "simple hung case": (
65        mock_logs(
66            messages={
67                LogSectionType.TEST_CASE: [
68                    section_timeout(LogSectionType.TEST_CASE) + 1
69                ]
70                * 1000
71            },
72            result="fail",
73        ),
74        pytest.raises(MesaCIRetryError),
75        "hung",
76        {},
77    ),
78    "leftover dump from last job in boot section": (
79        (
80            mock_lava_signal(LogSectionType.LAVA_BOOT),
81            jobs_logs_response(finished=False, msg=None, result="fail"),
82        ),
83        pytest.raises(MesaCIRetryError),
84        "hung",
85        {},
86    ),
87    "boot works at last retry": (
88        mock_logs(
89            messages={
90                LogSectionType.LAVA_BOOT: [
91                    section_timeout(LogSectionType.LAVA_BOOT) + 1
92                ]
93                * NUMBER_OF_RETRIES_TIMEOUT_DETECTION
94                + [1]
95            },
96            result="pass",
97        ),
98        does_not_raise(),
99        "pass",
100        {},
101    ),
102    "test case took too long": pytest.param(
103        mock_logs(
104            messages={
105                LogSectionType.TEST_CASE: [
106                    section_timeout(LogSectionType.TEST_CASE) + 1
107                ]
108                * (NUMBER_OF_MAX_ATTEMPTS + 1)
109            },
110            result="pass",
111        ),
112        pytest.raises(MesaCIRetryError),
113        "pass",
114        {},
115    ),
116    "timed out more times than retry attempts": (
117        generate_n_logs(n=4, tick_fn=9999999),
118        pytest.raises(MesaCIRetryError),
119        "fail",
120        {},
121    ),
122    "long log case, no silence": (
123        mock_logs(
124            messages={LogSectionType.TEST_CASE: [1] * (1000)},
125            result="pass",
126        ),
127        does_not_raise(),
128        "pass",
129        {},
130    ),
131    "no retries, testsuite succeed": (
132        mock_logs(result="pass"),
133        does_not_raise(),
134        "pass",
135        {
136            "testsuite_results": [
137                generate_testsuite_result(result="pass")
138            ]
139        },
140    ),
141    "no retries, but testsuite fails": (
142        mock_logs(result="fail"),
143        does_not_raise(),
144        "fail",
145        {
146            "testsuite_results": [
147                generate_testsuite_result(result="fail")
148            ]
149        },
150    ),
151    "no retries, one testsuite fails": (
152        generate_n_logs(n=1, tick_fn=0, result="fail"),
153        does_not_raise(),
154        "fail",
155        {
156            "testsuite_results": [
157                generate_testsuite_result(result="fail"),
158                generate_testsuite_result(result="pass")
159            ]
160        },
161    ),
162    "very long silence": (
163        generate_n_logs(n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=100000),
164        pytest.raises(MesaCIRetryError),
165        "fail",
166        {},
167    ),
168    # If a protocol error happens, _call_proxy will retry without affecting timeouts
169    "unstable connection, ProtocolError followed by final message": (
170        (NETWORK_EXCEPTION, *list(mock_logs(result="pass"))),
171        does_not_raise(),
172        "pass",
173        {},
174    ),
175    # After an arbitrary number of retries, _call_proxy should call sys.exit
176    "unreachable case, subsequent ProtocolErrors": (
177        repeat(NETWORK_EXCEPTION),
178        pytest.raises(SystemExit),
179        "fail",
180        {},
181    ),
182    "XMLRPC Fault": ([XMLRPC_FAULT], pytest.raises(SystemExit, match="1"), False, {}),
183}
184
185
186@pytest.mark.parametrize(
187    "test_log, expectation, job_result, proxy_args",
188    PROXY_SCENARIOS.values(),
189    ids=PROXY_SCENARIOS.keys(),
190)
191def test_retriable_follow_job(
192    mock_sleep,
193    test_log,
194    expectation,
195    job_result,
196    proxy_args,
197    mock_proxy,
198):
199    with expectation:
200        proxy = mock_proxy(side_effect=test_log, **proxy_args)
201        job: LAVAJob = retriable_follow_job(proxy, "")
202        assert job_result == job.status
203
204
205WAIT_FOR_JOB_SCENARIOS = {"one log run taking (sec):": (mock_logs(result="pass"))}
206
207
208@pytest.mark.parametrize("wait_time", (DEVICE_HANGING_TIMEOUT_SEC * 2,))
209@pytest.mark.parametrize(
210    "side_effect",
211    WAIT_FOR_JOB_SCENARIOS.values(),
212    ids=WAIT_FOR_JOB_SCENARIOS.keys(),
213)
214def test_simulate_a_long_wait_to_start_a_job(
215    frozen_time,
216    wait_time,
217    side_effect,
218    mock_proxy_waiting_time,
219):
220    start_time = datetime.now()
221    job: LAVAJob = retriable_follow_job(
222        mock_proxy_waiting_time(
223            frozen_time, side_effect=side_effect, wait_time=wait_time
224        ),
225        "",
226    )
227
228    end_time = datetime.now()
229    delta_time = end_time - start_time
230
231    assert job.status == "pass"
232    assert delta_time.total_seconds() >= wait_time
233
234
235
236CORRUPTED_LOG_SCENARIOS = {
237    "too much subsequent corrupted data": (
238        [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
239        pytest.raises((MesaCIRetryError)),
240    ),
241    "one subsequent corrupted data": (
242        [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)],
243        does_not_raise(),
244    ),
245}
246
247
248@pytest.mark.parametrize(
249    "data_sequence, expected_exception",
250    CORRUPTED_LOG_SCENARIOS.values(),
251    ids=CORRUPTED_LOG_SCENARIOS.keys(),
252)
253def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy):
254    proxy_mock = mock_proxy()
255    proxy_logs_mock = proxy_mock.scheduler.jobs.logs
256    proxy_logs_mock.side_effect = data_sequence
257    with expected_exception:
258        retriable_follow_job(proxy_mock, "")
259
260
261LAVA_RESULT_LOG_SCENARIOS = {
262    # the submitter should accept xtrace logs
263    "Bash xtrace echo with kmsg interleaving": (
264        "echo hwci: mesa: pass[  737.673352] <LAVA_SIGNAL_ENDTC mesa-ci>",
265        "pass",
266    ),
267    # the submitter should accept xtrace logs
268    "kmsg result print": (
269        "[  737.673352] hwci: mesa: pass",
270        "pass",
271    ),
272    # if the job result echo has a very bad luck, it still can be interleaved
273    # with kmsg
274    "echo output with kmsg interleaving": (
275        "hwci: mesa: pass[  737.673352] <LAVA_SIGNAL_ENDTC mesa-ci>",
276        "pass",
277    ),
278    "fail case": (
279        "hwci: mesa: fail",
280        "fail",
281    ),
282}
283
284
285@pytest.mark.parametrize(
286    "message, expectation",
287    LAVA_RESULT_LOG_SCENARIOS.values(),
288    ids=LAVA_RESULT_LOG_SCENARIOS.keys(),
289)
290def test_parse_job_result_from_log(message, expectation, mock_proxy):
291    job = LAVAJob(mock_proxy(), "")
292    job.parse_job_result_from_log([message])
293
294    assert job.status == expectation
295
296
297@pytest.mark.slow(
298    reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml"
299)
300def test_full_yaml_log(mock_proxy, frozen_time):
301    import itertools
302    import random
303    from datetime import datetime
304
305    import yaml
306
307    def time_travel_from_log_chunk(data_chunk):
308        if not data_chunk:
309            return
310
311        first_log_time = data_chunk[0]["dt"]
312        frozen_time.move_to(first_log_time)
313        yield
314
315        last_log_time = data_chunk[-1]["dt"]
316        frozen_time.move_to(last_log_time)
317        return
318
319    def time_travel_to_test_time():
320        # Suppose that the first message timestamp of the entire LAVA job log is
321        # the same of from the job submitter execution
322        with open("/tmp/log.yaml", "r") as f:
323            first_log = f.readline()
324            first_log_time = yaml.safe_load(first_log)[0]["dt"]
325            frozen_time.move_to(first_log_time)
326
327    def load_lines() -> list:
328        with open("/tmp/log.yaml", "r") as f:
329            data = yaml.safe_load(f)
330            chain = itertools.chain(data)
331            try:
332                while True:
333                    data_chunk = [next(chain) for _ in range(random.randint(0, 50))]
334                    # Suppose that the first message timestamp is the same of
335                    # log fetch RPC call
336                    time_travel_from_log_chunk(data_chunk)
337                    yield False, []
338                    # Travel to the same datetime of the last fetched log line
339                    # in the chunk
340                    time_travel_from_log_chunk(data_chunk)
341                    yield False, data_chunk
342            except StopIteration:
343                yield True, data_chunk
344                return
345
346    proxy = mock_proxy()
347
348    def reset_logs(*args):
349        proxy.scheduler.jobs.logs.side_effect = load_lines()
350
351    proxy.scheduler.jobs.submit = reset_logs
352    with pytest.raises(MesaCIRetryError):
353        time_travel_to_test_time()
354        retriable_follow_job(proxy, "")
355