• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2
3"""This script runs cuda-memcheck on the specified unit test. Each test case
4is run in its isolated process with a timeout so that:
51) different test cases won't influence each other, and
62) in case of hang, the script would still finish in a finite amount of time.
7The output will be written to a log file result.log
8
9Example usage:
10    python run_cuda_memcheck.py ../test_torch.py 600
11
12Note that running cuda-memcheck could be very slow.
13"""
14
15import argparse
16import asyncio
17import multiprocessing
18import os
19import subprocess
20import sys
21
22import cuda_memcheck_common as cmc
23import tqdm
24
25import torch
26
27
# Global state shared by the worker coroutines defined below.
ALL_TESTS = []  # fully-qualified test names, e.g. "TestTorch.test_foo", filled in later
GPUS = torch.cuda.device_count()  # used to round-robin workers over visible devices

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    # NOTE: adjacent string literals are concatenated implicitly; the original
    # text was missing the separating spaces and trailed off mid-sentence.
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because "
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors "
    "still allows real problems to be detected.",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error "
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
# Parse the command line once at import time; every function below reads this
# module-level namespace directly instead of taking parameters.
args = parser.parse_args()
71
72
# Filter that ignores cublas/cudnn/cufft errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Return True if the cuda-memcheck report in ``output`` consists solely of
    errors whose stacks mention cublas/cudnn/cufft (ignored by default).

    Returns False when the output cannot be parsed by the simple parser, or
    when at least one reported error does not come from one of those libraries.
    Note: a report with zero errors vacuously returns True.
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False
    ignored_libraries = ("libcublas", "libcudnn", "libcufft")
    count_ignored_errors = 0
    for e in report.errors:
        # Join the stack frames once per error (the original joined the same
        # list three times, once per library substring check).
        stack = "".join(e.stack)
        if any(lib in stack for lib in ignored_libraries):
            count_ignored_errors += 1
    return count_ignored_errors == report.num_errors
91
92
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
# (presumably the test files consult this variable to skip cases known to be
# incompatible with cuda-memcheck -- confirm against the test suite).
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"
95
# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output.
#
# subprocess.run is the modern replacement for the Popen/communicate pair; it
# blocks until pytest exits, and both streams are captured so pytest's own
# output does not pollute this script's stdout.
proc = subprocess.run(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
for line in proc.stdout.decode().strip().splitlines():
    # Collected-test lines look like:
    #   <path>::<TestClass>::<test_name> (fixtures used: ...)
    if "(fixtures used:" in line:
        line = line.strip().split()[0]
        line = line[line.find("::") + 2 :]  # drop the file-path prefix
        line = line.replace("::", ".")  # TestClass::test -> TestClass.test
        ALL_TESTS.append(line)
113
114
# Simple name-based filter: a test is considered CPU-only when its name
# mentions cpu (any case) but never mentions cuda.
def is_cpu_only(name):
    """Return True for test names that mention cpu but not cuda (case-insensitive)."""
    lowered = name.lower()
    if "cuda" in lowered:
        return False
    return "cpu" in lowered
120
121
# Drop every CPU-only test: cuda-memcheck only tells us something about CUDA code.
ALL_TESTS = [t for t in ALL_TESTS if not is_cpu_only(t)]

# Sort for a deterministic order, then keep only the args.rank-th of
# args.split equal-sized chunks (the final chunk may come up short).
ALL_TESTS.sort()
chunk_size = -(-len(ALL_TESTS) // args.split)  # ceiling division
start = args.rank * chunk_size
end = start + chunk_size
ALL_TESTS = ALL_TESTS[start:end]
130
# Run tests:
# cuda-memcheck makes PyTorch unit tests very slow, so they are run in
# parallel: args.nproc coroutines each own one subprocess at a time and await
# its completion. Workers are balanced across the GPUs on the system, one
# device per worker by default, or as assigned via --gpus.
progress = 0  # index of the next test to hand out to a worker
if args.ci:
    # In CI everything goes straight to stdout and no progress bar is drawn.
    logfile = sys.stdout

    # Stand-in progress bar with the same .update() interface as tqdm but no
    # display at all.
    class ProgressbarStub:
        def update(self, *args):
            return

    progressbar = ProgressbarStub()
else:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
151
152
async def run1(coroutine_id):
    """Worker coroutine: repeatedly claim the next test from ALL_TESTS and run
    it under cuda-memcheck in an isolated subprocess, logging the outcome.

    ``coroutine_id`` selects this worker's GPU assignment. Outcomes are
    written to the module-level ``logfile``; in --ci mode a failure (or a hang,
    unless --nohang) aborts the whole script via sys.exit.
    """
    global progress

    # Pick the GPU assignment: round-robin over all devices, or this worker's
    # entry from the ":"-separated --gpus list (which may name several GPUs,
    # e.g. "1,2", passed verbatim to CUDA_VISIBLE_DEVICES).
    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]

    # Reading and incrementing ``progress`` with no await in between is safe:
    # asyncio coroutines only interleave at await points, so two workers can
    # never claim the same index.
    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            # Bound each test case by --timeout seconds so a hang cannot stall
            # the whole run.
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            # NOTE(review): the killed process is not awaited afterwards, so it
            # may linger until the script exits -- confirm this is intended.
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                # Hide failures consisting solely of cublas/cudnn/cufft errors
                # unless --strict was given.
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)
196
197
async def main():
    """Launch args.nproc worker coroutines concurrently and wait for each one.

    All workers are scheduled up front so they run concurrently; they are then
    awaited one by one, preserving the order in which exceptions surface.
    """
    workers = [asyncio.ensure_future(run1(worker_id)) for worker_id in range(args.nproc)]
    for worker in workers:
        await worker
202
203
if __name__ == "__main__":
    # asyncio.run() creates, runs and closes an event loop in one call; the
    # get_event_loop()/run_until_complete() pattern for launching a program's
    # top-level coroutine is deprecated (get_event_loop's implicit loop
    # creation was deprecated in Python 3.10).
    asyncio.run(main())
207