1#!/usr/bin/env python3 2# 3# Copyright © 2020 Google LLC 4# SPDX-License-Identifier: MIT 5 6import argparse 7import datetime 8import math 9import os 10import re 11import sys 12 13from custom_logger import CustomLogger 14from serial_buffer import SerialBuffer 15 16ANSI_ESCAPE="\x1b[0K" 17ANSI_COLOUR="\x1b[0;36m" 18ANSI_RESET="\x1b[0m" 19SECTION_START="start" 20SECTION_END="end" 21 22class CrosServoRun: 23 def __init__(self, cpu, ec, test_timeout, logger): 24 self.cpu_ser = SerialBuffer( 25 cpu, "results/serial.txt", ": ") 26 # Merge the EC serial into the cpu_ser's line stream so that we can 27 # effectively poll on both at the same time and not have to worry about 28 self.ec_ser = SerialBuffer( 29 ec, "results/serial-ec.txt", " EC: ", line_queue=self.cpu_ser.line_queue) 30 self.test_timeout = test_timeout 31 self.logger = logger 32 33 def close(self): 34 self.ec_ser.close() 35 self.cpu_ser.close() 36 37 def ec_write(self, s): 38 print("EC> %s" % s) 39 self.ec_ser.serial.write(s.encode()) 40 41 def cpu_write(self, s): 42 print("> %s" % s) 43 self.cpu_ser.serial.write(s.encode()) 44 45 def print_error(self, message): 46 RED = '\033[0;31m' 47 NO_COLOR = '\033[0m' 48 print(RED + message + NO_COLOR) 49 self.logger.update_status_fail(message) 50 51 def get_rel_timestamp(self): 52 now = datetime.datetime.now(tz=datetime.UTC) 53 then_env = os.getenv("CI_JOB_STARTED_AT") 54 if not then_env: 55 return "" 56 delta = now - datetime.datetime.fromisoformat(then_env) 57 return f"[{math.floor(delta.seconds / 60):02}:{(delta.seconds % 60):02}]" 58 59 def get_cur_timestamp(self): 60 return str(int(datetime.datetime.timestamp(datetime.datetime.now()))) 61 62 def print_gitlab_section(self, action, name, description, collapse=True): 63 assert action in [SECTION_START, SECTION_END] 64 out = ANSI_ESCAPE + "section_" + action + ":" 65 out += self.get_cur_timestamp() + ":" 66 out += name 67 if action == "start" and collapse: 68 out += "[collapsed=true]" 69 out += "\r" + ANSI_ESCAPE + ANSI_COLOUR 70 out += self.get_rel_timestamp() + " " + description + ANSI_RESET 71 print(out) 72 73 def boot_section(self, action): 74 self.print_gitlab_section(action, "dut_boot", "Booting hardware device", True) 75 76 def run(self): 77 # Flush any partial commands in the EC's prompt, then ask for a reboot. 78 self.ec_write("\n") 79 self.ec_write("reboot\n") 80 81 bootloader_done = False 82 self.logger.create_job_phase("boot") 83 self.boot_section(SECTION_START) 84 tftp_failures = 0 85 # This is emitted right when the bootloader pauses to check for input. 86 # Emit a ^N character to request network boot, because we don't have a 87 # direct-to-netboot firmware on cheza. 88 for line in self.cpu_ser.lines(timeout=120, phase="bootloader"): 89 if re.search("load_archive: loading locale_en.bin", line): 90 self.cpu_write("\016") 91 bootloader_done = True 92 break 93 94 # The Cheza firmware seems to occasionally get stuck looping in 95 # this error state during TFTP booting, possibly based on amount of 96 # network traffic around it, but it'll usually recover after a 97 # reboot. Currently mostly visible on google-freedreno-cheza-14. 98 if re.search("R8152: Bulk read error 0xffffffbf", line): 99 tftp_failures += 1 100 if tftp_failures >= 10: 101 self.print_error( 102 "Detected intermittent tftp failure, restarting run.") 103 return 1 104 105 # If the board has a netboot firmware and we made it to booting the 106 # kernel, proceed to processing of the test run. 107 if re.search("Booting Linux", line): 108 bootloader_done = True 109 break 110 111 # The Cheza boards have issues with failing to bring up power to 112 # the system sometimes, possibly dependent on ambient temperature 113 # in the farm. 114 if re.search("POWER_GOOD not seen in time", line): 115 self.print_error( 116 "Detected intermittent poweron failure, abandoning run.") 117 return 1 118 119 if not bootloader_done: 120 self.print_error("Failed to make it through bootloader, abandoning run.") 121 return 1 122 123 self.logger.create_job_phase("test") 124 for line in self.cpu_ser.lines(timeout=self.test_timeout, phase="test"): 125 if re.search("---. end Kernel panic", line): 126 return 1 127 128 # There are very infrequent bus errors during power management transitions 129 # on cheza, which we don't expect to be the case on future boards. 130 if re.search("Kernel panic - not syncing: Asynchronous SError Interrupt", line): 131 self.print_error( 132 "Detected cheza power management bus error, abandoning run.") 133 return 1 134 135 # If the network device dies, it's probably not graphics's fault, just try again. 136 if re.search("NETDEV WATCHDOG", line): 137 self.print_error( 138 "Detected network device failure, abandoning run.") 139 return 1 140 141 # These HFI response errors started appearing with the introduction 142 # of piglit runs. CosmicPenguin says: 143 # 144 # "message ID 106 isn't a thing, so likely what happened is that we 145 # got confused when parsing the HFI queue. If it happened on only 146 # one run, then memory corruption could be a possible clue" 147 # 148 # Given that it seems to trigger randomly near a GPU fault and then 149 # break many tests after that, just restart the whole run. 150 if re.search("a6xx_hfi_send_msg.*Unexpected message id .* on the response queue", line): 151 self.print_error( 152 "Detected cheza power management bus error, abandoning run.") 153 return 1 154 155 if re.search("coreboot.*bootblock starting", line): 156 self.print_error( 157 "Detected spontaneous reboot, abandoning run.") 158 return 1 159 160 if re.search("arm-smmu 5040000.iommu: TLB sync timed out -- SMMU may be deadlocked", line): 161 self.print_error("Detected cheza MMU fail, abandoning run.") 162 return 1 163 164 result = re.search(r"hwci: mesa: (\S*), exit_code: (\d+)", line) 165 if result: 166 status = result.group(1) 167 exit_code = int(result.group(2)) 168 169 if status == "pass": 170 self.logger.update_dut_job("status", "pass") 171 else: 172 self.logger.update_status_fail("test fail") 173 174 self.logger.update_dut_job("exit_code", exit_code) 175 return exit_code 176 177 self.print_error( 178 "Reached the end of the CPU serial log without finding a result") 179 return 1 180 181 182def main(): 183 parser = argparse.ArgumentParser() 184 parser.add_argument('--cpu', type=str, 185 help='CPU Serial device', required=True) 186 parser.add_argument( 187 '--ec', type=str, help='EC Serial device', required=True) 188 parser.add_argument( 189 '--test-timeout', type=int, help='Test phase timeout (minutes)', required=True) 190 args = parser.parse_args() 191 192 logger = CustomLogger("results/job_detail.json") 193 logger.update_dut_time("start", None) 194 servo = CrosServoRun(args.cpu, args.ec, args.test_timeout * 60, logger) 195 retval = servo.run() 196 197 # power down the CPU on the device 198 servo.ec_write("power off\n") 199 logger.update_dut_time("end", None) 200 servo.close() 201 202 sys.exit(retval) 203 204 205if __name__ == '__main__': 206 main() 207