#!/usr/bin/env python3
#
# Copyright (C) 2022 Collabora Limited
# Author: Guilherme Gallo <guilherme.gallo@collabora.com>
#
# SPDX-License-Identifier: MIT

"""Tests for the LAVA job submitter's log-following and retry machinery.

Scenario tables (``PROXY_SCENARIOS``, ``CORRUPTED_LOG_SCENARIOS``,
``LAVA_RESULT_LOG_SCENARIOS``) drive parametrized tests against a mocked
XML-RPC proxy; the log streams themselves come from helpers in
``.lava.helpers`` (``mock_logs``, ``generate_n_logs``, ...).
"""

import xmlrpc.client
from contextlib import nullcontext as does_not_raise
from datetime import datetime
from itertools import chain, repeat

import pytest
from lava.exceptions import MesaCIException, MesaCIRetryError
from lava.lava_job_submitter import (
    DEVICE_HANGING_TIMEOUT_SEC,
    NUMBER_OF_RETRIES_TIMEOUT_DETECTION,
    LAVAJob,
    follow_job_execution,
    retriable_follow_job,
)
from lava.utils import LogSectionType

from .lava.helpers import (
    generate_n_logs,
    generate_testsuite_result,
    jobs_logs_response,
    mock_lava_signal,
    mock_logs,
    section_timeout,
)

# One initial attempt plus the configured number of timeout-detection retries.
NUMBER_OF_MAX_ATTEMPTS = NUMBER_OF_RETRIES_TIMEOUT_DETECTION + 1


@pytest.fixture
def mock_proxy_waiting_time(mock_proxy):
    """Wrap the ``mock_proxy`` fixture so ``scheduler.job_state`` reports a
    Running job while ticking a frozen clock forward by ``wait_time`` seconds,
    simulating a job that takes a while to start."""

    def update_mock_proxy(frozen_time, **kwargs):
        # ``wait_time`` is consumed here; remaining kwargs go to mock_proxy.
        wait_time = kwargs.pop("wait_time", 1)
        proxy_mock = mock_proxy(**kwargs)
        proxy_job_state = proxy_mock.scheduler.job_state
        proxy_job_state.return_value = {"job_state": "Running"}
        # NOTE(review): side_effect is assigned the *return value* of
        # frozen_time.tick(wait_time), not a callable — so the clock advances
        # exactly once, at fixture-build time. Presumably tick() returns None
        # here (making side_effect a no-op); confirm against the freezegun
        # version in use, since newer versions return the new datetime.
        proxy_job_state.side_effect = frozen_time.tick(wait_time)

        return proxy_mock

    return update_mock_proxy


@pytest.mark.parametrize("exception", [RuntimeError, SystemError, KeyError])
def test_submit_and_follow_respects_exceptions(mock_sleep, mock_proxy, exception):
    """Arbitrary proxy-side exceptions must surface as MesaCIException."""
    with pytest.raises(MesaCIException):
        proxy = mock_proxy(side_effect=exception)
        job = LAVAJob(proxy, '')
        follow_job_execution(job)


NETWORK_EXCEPTION = xmlrpc.client.ProtocolError("", 0, "test", {})
XMLRPC_FAULT = xmlrpc.client.Fault(0, "test")

# Each scenario: (log side_effect, exception expectation, expected job.status,
# extra kwargs for mock_proxy).
PROXY_SCENARIOS = {
    "simple pass case": (mock_logs(result="pass"), does_not_raise(), "pass", {}),
    "simple fail case": (mock_logs(result="fail"), does_not_raise(), "fail", {}),
    "simple hung case": (
        mock_logs(
            messages={
                LogSectionType.TEST_CASE: [
                    section_timeout(LogSectionType.TEST_CASE) + 1
                ]
                * 1000
            },
            result="fail",
        ),
        pytest.raises(MesaCIRetryError),
        "hung",
        {},
    ),
    "leftover dump from last job in boot section": (
        (
            mock_lava_signal(LogSectionType.LAVA_BOOT),
            jobs_logs_response(finished=False, msg=None, result="fail"),
        ),
        pytest.raises(MesaCIRetryError),
        "hung",
        {},
    ),
    "boot works at last retry": (
        # Boot times out NUMBER_OF_RETRIES_TIMEOUT_DETECTION times, then the
        # final attempt (the trailing [1]) succeeds within the timeout.
        mock_logs(
            messages={
                LogSectionType.LAVA_BOOT: [
                    section_timeout(LogSectionType.LAVA_BOOT) + 1
                ]
                * NUMBER_OF_RETRIES_TIMEOUT_DETECTION
                + [1]
            },
            result="pass",
        ),
        does_not_raise(),
        "pass",
        {},
    ),
    "test case took too long": pytest.param(
        # One more timeout than the retry budget allows -> retry error even
        # though the underlying result would have been "pass".
        mock_logs(
            messages={
                LogSectionType.TEST_CASE: [
                    section_timeout(LogSectionType.TEST_CASE) + 1
                ]
                * (NUMBER_OF_MAX_ATTEMPTS + 1)
            },
            result="pass",
        ),
        pytest.raises(MesaCIRetryError),
        "pass",
        {},
    ),
    "timed out more times than retry attempts": (
        generate_n_logs(n=4, tick_fn=9999999),
        pytest.raises(MesaCIRetryError),
        "fail",
        {},
    ),
    "long log case, no silence": (
        mock_logs(
            messages={LogSectionType.TEST_CASE: [1] * (1000)},
            result="pass",
        ),
        does_not_raise(),
        "pass",
        {},
    ),
    "no retries, testsuite succeed": (
        mock_logs(result="pass"),
        does_not_raise(),
        "pass",
        {
            "testsuite_results": [
                generate_testsuite_result(result="pass")
            ]
        },
    ),
    "no retries, but testsuite fails": (
        mock_logs(result="fail"),
        does_not_raise(),
        "fail",
        {
            "testsuite_results": [
                generate_testsuite_result(result="fail")
            ]
        },
    ),
    "no retries, one testsuite fails": (
        # A single failing suite among the results must fail the whole job.
        generate_n_logs(n=1, tick_fn=0, result="fail"),
        does_not_raise(),
        "fail",
        {
            "testsuite_results": [
                generate_testsuite_result(result="fail"),
                generate_testsuite_result(result="pass")
            ]
        },
    ),
    "very long silence": (
        generate_n_logs(n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=100000),
        pytest.raises(MesaCIRetryError),
        "fail",
        {},
    ),
    # If a protocol error happens, _call_proxy will retry without affecting timeouts
    "unstable connection, ProtocolError followed by final message": (
        (NETWORK_EXCEPTION, *list(mock_logs(result="pass"))),
        does_not_raise(),
        "pass",
        {},
    ),
    # After an arbitrary number of retries, _call_proxy should call sys.exit
    "unreachable case, subsequent ProtocolErrors": (
        repeat(NETWORK_EXCEPTION),
        pytest.raises(SystemExit),
        "fail",
        {},
    ),
    "XMLRPC Fault": ([XMLRPC_FAULT], pytest.raises(SystemExit, match="1"), False, {}),
}


@pytest.mark.parametrize(
    "test_log, expectation, job_result, proxy_args",
    PROXY_SCENARIOS.values(),
    ids=PROXY_SCENARIOS.keys(),
)
def test_retriable_follow_job(
    mock_sleep,
    test_log,
    expectation,
    job_result,
    proxy_args,
    mock_proxy,
):
    """Drive retriable_follow_job through each PROXY_SCENARIOS entry and
    check both the raised exception (if any) and the final job status."""
    with expectation:
        proxy = mock_proxy(side_effect=test_log, **proxy_args)
        job: LAVAJob = retriable_follow_job(proxy, "")
        assert job_result == job.status


WAIT_FOR_JOB_SCENARIOS = {"one log run taking (sec):": (mock_logs(result="pass"))}


@pytest.mark.parametrize("wait_time", (DEVICE_HANGING_TIMEOUT_SEC * 2,))
@pytest.mark.parametrize(
    "side_effect",
    WAIT_FOR_JOB_SCENARIOS.values(),
    ids=WAIT_FOR_JOB_SCENARIOS.keys(),
)
def test_simulate_a_long_wait_to_start_a_job(
    frozen_time,
    wait_time,
    side_effect,
    mock_proxy_waiting_time,
):
    """A job that waits longer than the hang timeout before starting must
    still be followed to completion (status "pass"), and the (frozen-clock)
    elapsed time must reflect the wait."""
    start_time = datetime.now()
    job: LAVAJob = retriable_follow_job(
        mock_proxy_waiting_time(
            frozen_time, side_effect=side_effect, wait_time=wait_time
        ),
        "",
    )

    end_time = datetime.now()
    delta_time = end_time - start_time

    assert job.status == "pass"
    assert delta_time.total_seconds() >= wait_time



CORRUPTED_LOG_SCENARIOS = {
    # 100 unparseable chunks in a row exhausts the retry budget...
    "too much subsequent corrupted data": (
        [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
        pytest.raises((MesaCIRetryError)),
        # ...while a couple of corrupted chunks are tolerated.
    ),
    "one subsequent corrupted data": (
        [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)],
        does_not_raise(),
    ),
}


@pytest.mark.parametrize(
    "data_sequence, expected_exception",
    CORRUPTED_LOG_SCENARIOS.values(),
    ids=CORRUPTED_LOG_SCENARIOS.keys(),
)
def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy):
    """Corrupted log payloads from the scheduler must only abort the job when
    they persist beyond the retry budget."""
    proxy_mock = mock_proxy()
    proxy_logs_mock = proxy_mock.scheduler.jobs.logs
    proxy_logs_mock.side_effect = data_sequence
    with expected_exception:
        retriable_follow_job(proxy_mock, "")


# Raw log line -> expected parsed job status.
LAVA_RESULT_LOG_SCENARIOS = {
    # the submitter should accept xtrace logs
    "Bash xtrace echo with kmsg interleaving": (
        "echo hwci: mesa: pass[ 737.673352] <LAVA_SIGNAL_ENDTC mesa-ci>",
        "pass",
    ),
    # the submitter should accept xtrace logs
    "kmsg result print": (
        "[ 737.673352] hwci: mesa: pass",
        "pass",
    ),
    # if the job result echo has a very bad luck, it still can be interleaved
    # with kmsg
    "echo output with kmsg interleaving": (
        "hwci: mesa: pass[ 737.673352] <LAVA_SIGNAL_ENDTC mesa-ci>",
        "pass",
    ),
    "fail case": (
        "hwci: mesa: fail",
        "fail",
    ),
}


@pytest.mark.parametrize(
    "message, expectation",
    LAVA_RESULT_LOG_SCENARIOS.values(),
    ids=LAVA_RESULT_LOG_SCENARIOS.keys(),
)
def test_parse_job_result_from_log(message, expectation, mock_proxy):
    """The result parser must extract the hwci status even when the line is
    interleaved with kmsg output."""
    job = LAVAJob(mock_proxy(), "")
    job.parse_job_result_from_log([message])

    assert job.status == expectation


@pytest.mark.slow(
    reason="Slow and sketchy test. Needs a LAVA log raw file at /tmp/log.yaml"
)
def test_full_yaml_log(mock_proxy, frozen_time):
    """Replay a real LAVA log dump (/tmp/log.yaml) in random-sized chunks,
    moving a frozen clock along the log timestamps, and expect the follower
    to eventually give up with MesaCIRetryError."""
    import itertools
    import random
    from datetime import datetime

    import yaml

    def time_travel_from_log_chunk(data_chunk):
        # Generator: first resume point jumps to the chunk's first timestamp,
        # second to its last — callers advance it twice per chunk.
        if not data_chunk:
            return

        first_log_time = data_chunk[0]["dt"]
        frozen_time.move_to(first_log_time)
        yield

        last_log_time = data_chunk[-1]["dt"]
        frozen_time.move_to(last_log_time)
        return

    def time_travel_to_test_time():
        # Suppose that the first message timestamp of the entire LAVA job log is
        # the same of from the job submitter execution
        with open("/tmp/log.yaml", "r") as f:
            first_log = f.readline()
            first_log_time = yaml.safe_load(first_log)[0]["dt"]
            frozen_time.move_to(first_log_time)

    def load_lines() -> list:
        with open("/tmp/log.yaml", "r") as f:
            data = yaml.safe_load(f)
            # NOTE(review): this local ``chain`` shadows the module-level
            # ``itertools.chain`` import; harmless here, but confusing.
            chain = itertools.chain(data)
            try:
                while True:
                    data_chunk = [next(chain) for _ in range(random.randint(0, 50))]
                    # Suppose that the first message timestamp is the same of
                    # log fetch RPC call
                    time_travel_from_log_chunk(data_chunk)
                    yield False, []
                    # Travel to the same datetime of the last fetched log line
                    # in the chunk
                    time_travel_from_log_chunk(data_chunk)
                    yield False, data_chunk
            except StopIteration:
                yield True, data_chunk
                return

    proxy = mock_proxy()

    def reset_logs(*args):
        # Re-arm the log stream each time a (re)submission happens.
        proxy.scheduler.jobs.logs.side_effect = load_lines()

    proxy.scheduler.jobs.submit = reset_logs
    with pytest.raises(MesaCIRetryError):
        time_travel_to_test_time()
        retriable_follow_job(proxy, "")