name: linux-test

on:
  workflow_call:
    inputs:
      build-environment:
        required: true
        type: string
        description: Top-level label for what's being built/tested.
      test-matrix:
        required: true
        type: string
        description: JSON description of what test configs to run.
      docker-image:
        required: true
        type: string
        description: Docker image to run in.
      sync-tag:
        required: false
        type: string
        default: ""
        description: |
          If this is set, our linter will use this to make sure that every other
          job with the same `sync-tag` is identical.
      timeout-minutes:
        required: false
        type: number
        default: 240
        description: |
          Set the maximum time (in minutes) the workflow is allowed to take to finish
      use-gha:
        required: false
        type: string
        default: ""
        description: If set to any value, upload to GHA. Otherwise upload to S3.
      dashboard-tag:
        required: false
        type: string
        default: ""
      s3-bucket:
        description: S3 bucket to download artifacts from
        required: false
        type: string
        default: "gha-artifacts"
      aws-role-to-assume:
        description: Role to assume when downloading artifacts
        required: false
        type: string
        default: ""
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from the hub

env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

jobs:
  test:
    # Don't run on forked repos or an empty test matrix
    if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]'
    strategy:
      matrix: ${{ fromJSON(inputs.test-matrix) }}
      fail-fast: false
    runs-on: ${{ matrix.runner }}
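    # mem_leak_check jobs get a fixed 600-minute budget; all other jobs use the
    # caller-supplied timeout-minutes input.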
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.4
        if: ${{ !contains(matrix.runner, 'gcp.a100') }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
            All testing is done inside the container; to start an interactive session, run:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.4

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: configure aws credentials
        if: ${{ inputs.aws-role-to-assume != '' }}
        uses: aws-actions/configure-aws-credentials@v3
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-test
          aws-region: us-east-1

      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.4
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Use the following to pull a public copy of the image
        id: print-ghcr-mirror
        env:
          ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
        shell: bash
        run: |
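          # Keep only the "name:tag" part of the ECR image reference, then swap ':' for '-'
          # to form the tag published on the ghcr.io mirror.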
          tag=${ECR_DOCKER_IMAGE##*/}
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.4
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

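      # ARC (Actions Runner Controller) runners are detected via the /.inarc marker file;
      # the NVIDIA driver setup step below is skipped on them.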
      - name: Check if in an ARC runner
        shell: bash
        id: check_arc_runner
        run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.4
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}

      - name: Lock NVIDIA A100 40GB Frequency
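        # nvidia-smi -pm 1 enables persistence mode and -ac pins the application clocks
        # (memory,graphics in MHz) so the GPU runs at a fixed, known frequency.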
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
        if: contains(matrix.runner, 'a100')

      - name: Start monitoring script
        id: monitor-script
        shell: bash
        continue-on-error: true
        run: |
          python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
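          # Run the utilization monitor in the background and record its PID so the
          # "Stop monitoring script" step can kill it later.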
          python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

      - name: Download build artifacts
        uses: ./.github/actions/download-build-artifacts
        with:
          name: ${{ inputs.build-environment }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Download TD artifacts
        continue-on-error: true
        uses: ./.github/actions/download-td-artifacts

      - name: Parse ref
        id: parse-ref
        run: .github/scripts/parse_ref.py

      - name: Get workflow job id
        id: get-job-id
        uses: ./.github/actions/get-workflow-job-id
        if: always()
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Check for keep-going label and re-enabled test issues
        # This uses the filter-test-configs action because it conveniently
        # checks for labels and re-enabled test issues.  It does not actually do
        # any filtering.  All filtering is done in the build step.
        id: keep-going
        uses: ./.github/actions/filter-test-configs
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          test-matrix: ${{ inputs.test-matrix }}
          job-name: ${{ steps.get-job-id.outputs.job-name }}

      - name: Set Test step time
        id: test-timeout
        shell: bash
        env:
          JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
        run: |
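          # Give the Test step the job timeout minus 30 minutes, leaving headroom for the
          # remaining upload and teardown steps.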
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        env:
          BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
          BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

        run: |
          set -x

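          # Pick the test driver: multigpu and ONNX configs have dedicated entry points,
          # everything else goes through the default .ci/pytorch/test.sh.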
          if [[ $TEST_CONFIG == 'multigpu' ]]; then
            TEST_COMMAND=.ci/pytorch/multigpu-test.sh
          elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
            TEST_COMMAND=.ci/onnx/test.sh
          else
            TEST_COMMAND=.ci/pytorch/test.sh
          fi

          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e JOB_NAME \
            -e BASE_SHA \
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
            -e NUM_TEST_SHARDS \
            -e REENABLED_ISSUES \
            -e CONTINUE_THROUGH_ERROR \
            -e VERBOSE_TEST_LOGS \
            -e NO_TEST_TIMEOUT \
            -e NO_TD \
            -e TD_DISTRIBUTED \
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e HUGGING_FACE_HUB_TOKEN \
            -e DASHBOARD_TAG \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --ipc=host \
            --shm-size="${SHM_SIZE}" \
            --tty \
            --detach \
            --name="${container_name}" \
            --user jenkins \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # Propagate download.pytorch.org IP to container
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
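          # Install the freshly built wheel (with the opt-einsum extra) inside the container,
          # then run the selected test script.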
          docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
        continue-on-error: true
        if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
        with:
          cache_dir: .pytest_cache
          shard: ${{ matrix.shard }}
          sha: ${{ github.event.pull_request.head.sha || github.sha }}
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
        run: |
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
        if: always() && steps.monitor-script.outputs.monitor-script-pid
        shell: bash
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
        run: |
          kill "$MONITOR_SCRIPT_PID"

      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
        with:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
          use-gha: ${{ inputs.use-gha }}
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
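          # For any core dumps found in the workspace, print a backtrace with gdb inside
          # the test container.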
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on S3
        uses: seemethere/upload-artifact-s3@v5
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.4
        if: always()

      # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with
      # A10G GPUs. Once this happens, trying to reset the GPU as done in setup-nvidia does
      # not seem to help. Here are some symptoms:
      #   * Calling nvidia-smi times out after 60 seconds
      #   * Running nvidia-smi fails with an "unable to determine the device handle for GPU"
      #     unknown error
      #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
      #   * Running docker with --gpus all fails with an error response from the daemon
      #
      # As both the root cause and recovery path are unclear, let's take the runner out of
      # service so that it doesn't get any more jobs
      - name: Check NVIDIA driver installation step
        if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
        shell: bash
        env:
          RUNNER_WORKSPACE: ${{ runner.workspace }}
        run: |
          set +e
          set -x

          nvidia-smi
          # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0 even
          # when the driver has already crashed, as it can still get the driver version
          # and some basic information like the bus ID.  However, the rest of the information
          # would be missing (ERR!), for example:
          #
          # +-----------------------------------------------------------------------------+
          # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
          # |-------------------------------+----------------------+----------------------+
          # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
          # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
          # |                               |                      |               MIG M. |
          # |===============================+======================+======================|
          # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
          # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
          # |                               |                      |                 ERR! |
          # +-------------------------------+----------------------+----------------------+
          #
          # +-----------------------------------------------------------------------------+
          # | Processes:                                                                  |
          # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
          # |        ID   ID                                                   Usage      |
          # |=============================================================================|
          # +-----------------------------------------------------------------------------+
          #
          # This should be reported as a failure instead, as it is guaranteed to fail when
          # Docker tries to run with --gpus all
          #
          # So, the correct check here is to query one of the missing pieces of info, like the
          # GPU name, so that the command fails accordingly
          nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
          NVIDIA_SMI_STATUS=$?

          # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # For runners with multiple GPUs, we also want to confirm that the number of GPUs is a
          # power of 2, i.e. 1, 2, 4, or 8. This avoids flaky test issues when one GPU fails
          # https://github.com/pytorch/test-infra/issues/4000
          GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
          NVIDIA_SMI_STATUS=$?

          # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
          if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
            echo "NVIDIA driver installation has failed, shutting down the runner..."
            .github/scripts/stop_runner_service.sh
          fi

          # Check that the GPU count is a power of 2
          if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
            echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
            .github/scripts/stop_runner_service.sh
          fi
415