• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#
3# Copyright © 2020 Google LLC
4# SPDX-License-Identifier: MIT
5
6import argparse
7import datetime
8import math
9import os
10import re
11import sys
12
13from custom_logger import CustomLogger
14from serial_buffer import SerialBuffer
15
16ANSI_ESCAPE="\x1b[0K"
17ANSI_COLOUR="\x1b[0;36m"
18ANSI_RESET="\x1b[0m"
19SECTION_START="start"
20SECTION_END="end"
21
22class CrosServoRun:
23    def __init__(self, cpu, ec, test_timeout, logger):
24        self.cpu_ser = SerialBuffer(
25            cpu, "results/serial.txt", ": ")
26        # Merge the EC serial into the cpu_ser's line stream so that we can
27        # effectively poll on both at the same time and not have to worry about
28        self.ec_ser = SerialBuffer(
29            ec, "results/serial-ec.txt", " EC: ", line_queue=self.cpu_ser.line_queue)
30        self.test_timeout = test_timeout
31        self.logger = logger
32
33    def close(self):
34        self.ec_ser.close()
35        self.cpu_ser.close()
36
37    def ec_write(self, s):
38        print("EC> %s" % s)
39        self.ec_ser.serial.write(s.encode())
40
41    def cpu_write(self, s):
42        print("> %s" % s)
43        self.cpu_ser.serial.write(s.encode())
44
45    def print_error(self, message):
46        RED = '\033[0;31m'
47        NO_COLOR = '\033[0m'
48        print(RED + message + NO_COLOR)
49        self.logger.update_status_fail(message)
50
51    def get_rel_timestamp(self):
52        now = datetime.datetime.now(tz=datetime.UTC)
53        then_env = os.getenv("CI_JOB_STARTED_AT")
54        if not then_env:
55            return ""
56        delta = now - datetime.datetime.fromisoformat(then_env)
57        return f"[{math.floor(delta.seconds / 60):02}:{(delta.seconds % 60):02}]"
58
59    def get_cur_timestamp(self):
60        return str(int(datetime.datetime.timestamp(datetime.datetime.now())))
61
62    def print_gitlab_section(self, action, name, description, collapse=True):
63        assert action in [SECTION_START, SECTION_END]
64        out = ANSI_ESCAPE + "section_" + action + ":"
65        out += self.get_cur_timestamp() + ":"
66        out += name
67        if action == "start" and collapse:
68            out += "[collapsed=true]"
69        out += "\r" + ANSI_ESCAPE + ANSI_COLOUR
70        out += self.get_rel_timestamp() + " " + description + ANSI_RESET
71        print(out)
72
73    def boot_section(self, action):
74        self.print_gitlab_section(action, "dut_boot", "Booting hardware device", True)
75
76    def run(self):
77        # Flush any partial commands in the EC's prompt, then ask for a reboot.
78        self.ec_write("\n")
79        self.ec_write("reboot\n")
80
81        bootloader_done = False
82        self.logger.create_job_phase("boot")
83        self.boot_section(SECTION_START)
84        tftp_failures = 0
85        # This is emitted right when the bootloader pauses to check for input.
86        # Emit a ^N character to request network boot, because we don't have a
87        # direct-to-netboot firmware on cheza.
88        for line in self.cpu_ser.lines(timeout=120, phase="bootloader"):
89            if re.search("load_archive: loading locale_en.bin", line):
90                self.cpu_write("\016")
91                bootloader_done = True
92                break
93
94            # The Cheza firmware seems to occasionally get stuck looping in
95            # this error state during TFTP booting, possibly based on amount of
96            # network traffic around it, but it'll usually recover after a
97            # reboot. Currently mostly visible on google-freedreno-cheza-14.
98            if re.search("R8152: Bulk read error 0xffffffbf", line):
99                tftp_failures += 1
100                if tftp_failures >= 10:
101                    self.print_error(
102                        "Detected intermittent tftp failure, restarting run.")
103                    return 1
104
105            # If the board has a netboot firmware and we made it to booting the
106            # kernel, proceed to processing of the test run.
107            if re.search("Booting Linux", line):
108                bootloader_done = True
109                break
110
111            # The Cheza boards have issues with failing to bring up power to
112            # the system sometimes, possibly dependent on ambient temperature
113            # in the farm.
114            if re.search("POWER_GOOD not seen in time", line):
115                self.print_error(
116                    "Detected intermittent poweron failure, abandoning run.")
117                return 1
118
119        if not bootloader_done:
120            self.print_error("Failed to make it through bootloader, abandoning run.")
121            return 1
122
123        self.logger.create_job_phase("test")
124        for line in self.cpu_ser.lines(timeout=self.test_timeout, phase="test"):
125            if re.search("---. end Kernel panic", line):
126                return 1
127
128            # There are very infrequent bus errors during power management transitions
129            # on cheza, which we don't expect to be the case on future boards.
130            if re.search("Kernel panic - not syncing: Asynchronous SError Interrupt", line):
131                self.print_error(
132                    "Detected cheza power management bus error, abandoning run.")
133                return 1
134
135            # If the network device dies, it's probably not graphics's fault, just try again.
136            if re.search("NETDEV WATCHDOG", line):
137                self.print_error(
138                    "Detected network device failure, abandoning run.")
139                return 1
140
141            # These HFI response errors started appearing with the introduction
142            # of piglit runs.  CosmicPenguin says:
143            #
144            # "message ID 106 isn't a thing, so likely what happened is that we
145            # got confused when parsing the HFI queue.  If it happened on only
146            # one run, then memory corruption could be a possible clue"
147            #
148            # Given that it seems to trigger randomly near a GPU fault and then
149            # break many tests after that, just restart the whole run.
150            if re.search("a6xx_hfi_send_msg.*Unexpected message id .* on the response queue", line):
151                self.print_error(
152                    "Detected cheza power management bus error, abandoning run.")
153                return 1
154
155            if re.search("coreboot.*bootblock starting", line):
156                self.print_error(
157                    "Detected spontaneous reboot, abandoning run.")
158                return 1
159
160            if re.search("arm-smmu 5040000.iommu: TLB sync timed out -- SMMU may be deadlocked", line):
161                self.print_error("Detected cheza MMU fail, abandoning run.")
162                return 1
163
164            result = re.search(r"hwci: mesa: (\S*), exit_code: (\d+)", line)
165            if result:
166                status = result.group(1)
167                exit_code = int(result.group(2))
168
169                if status == "pass":
170                    self.logger.update_dut_job("status", "pass")
171                else:
172                    self.logger.update_status_fail("test fail")
173
174                self.logger.update_dut_job("exit_code", exit_code)
175                return exit_code
176
177        self.print_error(
178            "Reached the end of the CPU serial log without finding a result")
179        return 1
180
181
182def main():
183    parser = argparse.ArgumentParser()
184    parser.add_argument('--cpu', type=str,
185                        help='CPU Serial device', required=True)
186    parser.add_argument(
187        '--ec', type=str, help='EC Serial device', required=True)
188    parser.add_argument(
189        '--test-timeout', type=int, help='Test phase timeout (minutes)', required=True)
190    args = parser.parse_args()
191
192    logger = CustomLogger("results/job_detail.json")
193    logger.update_dut_time("start", None)
194    servo = CrosServoRun(args.cpu, args.ec, args.test_timeout * 60, logger)
195    retval = servo.run()
196
197    # power down the CPU on the device
198    servo.ec_write("power off\n")
199    logger.update_dut_time("end", None)
200    servo.close()
201
202    sys.exit(retval)
203
204
205if __name__ == '__main__':
206    main()
207