name: linux-test

inputs:
  build-environment:
    required: true
    type: string
    description: Top-level label for what's being built/tested.
  test-matrix:
    required: true
    type: string
    description: JSON description of what test configs to run.
  docker-image:
    required: true
    type: string
    description: Docker image to run in.
  sync-tag:
    required: false
    type: string
    default: ""
    description: |
      If this is set, our linter will use this to make sure that every other
      job with the same `sync-tag` is identical.
  use-gha:
    required: false
    type: string
    default: ""
    description: If set to any value, upload to GHA. Otherwise upload to S3.
  dashboard-tag:
    required: false
    type: string
    default: ""
  s3-bucket:
    description: S3 bucket to download artifacts from
    required: false
    type: string
    default: "gha-artifacts"
  aws-role-to-assume:
    description: Role to assume for downloading artifacts
    required: false
    type: string
    default: ""
  HUGGING_FACE_HUB_TOKEN:
    description: |
      HF auth token to avoid rate limits when downloading models or datasets from the hub
    required: false
    default: ""
  GITHUB_TOKEN:
    description: GitHub token
    required: true

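# Example (hypothetical) invocation from a calling workflow job that has already checked out
# the repository; the `with:` values below are illustrative, not canonical. The calling job is
# also expected to define a matrix (config, shard, num_shards, runner), since the steps below
# reference matrix.*:
#
#   - name: Test
#     uses: ./.github/actions/linux-test
#     with:
#       build-environment: linux-jammy-py3.9-gcc11
#       docker-image: ${{ needs.build.outputs.docker-image }}
#       test-matrix: ${{ needs.build.outputs.test-matrix }}
#       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
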
#env:
#  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

runs:
  using: composite
  steps:
    - name: Setup Linux
      uses: ./.github/actions/setup-linux

    - name: Configure AWS credentials
      if: ${{ inputs.aws-role-to-assume != '' }}
      uses: aws-actions/configure-aws-credentials@v3
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
        aws-region: us-east-1

    - name: Calculate docker image
      id: calculate-docker-image
      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
      with:
        docker-image-name: ${{ inputs.docker-image }}

    - name: Use the following to pull a public copy of the image
      id: print-ghcr-mirror
      env:
        ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
      shell: bash
      run: |
        tag=${ECR_DOCKER_IMAGE##*/}
        echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
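        # For example, a hypothetical ECR image ending in .../ci-image-name:abc123 would map to
        # ghcr.io/pytorch/ci-image:ci-image-name-abc123 (everything up to the last '/' is dropped
        # and the ':' separating name from tag becomes '-').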

    - name: Pull docker image
      uses: pytorch/test-infra/.github/actions/pull-docker-image@main
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

    - name: Check if in an ARC runner
      shell: bash
      id: check_arc_runner
      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
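      # /.inarc is assumed to be a marker file present only on ARC (Actions Runner Controller)
      # runners; the output is used below to skip the NVIDIA driver setup on such runners.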

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
      run: |
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
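        # -pm 1 enables persistence mode; -ac pins application clocks to 1215 MHz (memory) and
        # 1410 MHz (graphics) so tests on A100 runners see stable clock speeds.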
      if: contains(matrix.runner, 'a100')

    - name: Start monitoring script
      id: monitor-script
      shell: bash
      continue-on-error: true
      run: |
        python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
        python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
        echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
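        # The monitor runs in the background; $! captures its PID as a step output so the
        # "Stop monitoring script" step below can kill it once the tests finish.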

    - name: Download build artifacts
      uses: ./.github/actions/download-build-artifacts
      with:
        name: ${{ inputs.build-environment }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Download TD artifacts
      continue-on-error: true
      uses: ./.github/actions/download-td-artifacts

    - name: Parse ref
      id: parse-ref
      shell: bash
      run: .github/scripts/parse_ref.py
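      # parse_ref.py writes step outputs (e.g. branch) that are consumed later via
      # steps.parse-ref.outputs.branch in the Test step's environment.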

    - name: Get workflow job id
      id: get-job-id
      uses: ./.github/actions/get-workflow-job-id
      if: always()
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}

    - name: Check for keep-going label and re-enabled test issues
      # This uses the filter-test-configs action because it conveniently
      # checks for labels and re-enabled test issues. It does not actually do
      # any filtering. All filtering is done in the build step.
      id: keep-going
      uses: ./.github/actions/filter-test-configs
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}
        test-matrix: ${{ inputs.test-matrix }}
        job-name: ${{ steps.get-job-id.outputs.job-name }}

    - name: Test
      id: test
      env:
        BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GITHUB_REPOSITORY: ${{ github.repository }}
        GITHUB_WORKFLOW: ${{ github.workflow }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_RUN_NUMBER: ${{ github.run_number }}
        GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
        JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        TEST_CONFIG: ${{ matrix.config }}
        SHARD_NUMBER: ${{ matrix.shard }}
        NUM_TEST_SHARDS: ${{ matrix.num_shards }}
        REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
        CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
        VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
        TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
        NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
        XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
      shell: bash
      run: |
        set -x

        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.ci/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.ci/onnx/test.sh
        else
          TEST_COMMAND=.ci/pytorch/test.sh
        fi

        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
          -e GITHUB_REPOSITORY \
          -e GITHUB_WORKFLOW \
          -e GITHUB_JOB \
          -e GITHUB_RUN_ID \
          -e GITHUB_RUN_NUMBER \
          -e GITHUB_RUN_ATTEMPT \
          -e JOB_ID \
          -e JOB_NAME \
          -e BASE_SHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e REENABLED_ISSUES \
          -e CONTINUE_THROUGH_ERROR \
          -e VERBOSE_TEST_LOGS \
          -e NO_TEST_TIMEOUT \
          -e NO_TD \
          -e TD_DISTRIBUTED \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
          -e DASHBOARD_TAG \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --name="${container_name}" \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        # Propagate download.pytorch.org IP to container
        grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
        echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
        docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
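        # Install the wheel from the downloaded build artifacts (with the opt-einsum extra)
        # inside the container, then run the test script selected earlier.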

    - name: Upload pytest cache if tests failed
      uses: ./.github/actions/pytest-cache-upload
      continue-on-error: true
      if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
      with:
        cache_dir: .pytest_cache
        shard: ${{ matrix.shard }}
        sha: ${{ github.event.pull_request.head.sha || github.sha }}
        test_config: ${{ matrix.config }}
        job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

    - name: Print remaining test logs
      shell: bash
      if: always() && steps.test.conclusion
      run: |
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
      if: always() && steps.monitor-script.outputs.monitor-script-pid
      shell: bash
      continue-on-error: true
      env:
        MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
      run: |
        kill "$MONITOR_SCRIPT_PID"

    - name: Upload test artifacts
      uses: ./.github/actions/upload-test-artifacts
      if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
      with:
        file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        use-gha: ${{ inputs.use-gha }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Collect backtraces from coredumps (if any)
      if: always()
      shell: bash
      run: |
        # shellcheck disable=SC2156
        find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
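        # For each core.<pid> file found in the workspace, run gdb inside the test container to
        # print a backtrace ('bt') and quit, so crash stacks show up in the job log.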

    - name: Store Core dumps on S3
      uses: seemethere/upload-artifact-s3@v5
      if: failure()
      with:
        name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always()

    # NB: We are currently seeing an intermittent GPU-related issue on G5 runners with the
    # A10G GPU. When this happens, trying to reset the GPU as done in setup-nvidia does
    # not seem to help. Here are some symptoms:
    #   * Calling nvidia-smi times out after 60 seconds
    #   * nvidia-smi fails with an "unable to determine the device handle for GPU" unknown error
    #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
    #   * Running docker with --gpus all fails with an error response from the daemon
    #
    # As both the root cause and recovery path are unclear, let's take the runner out of
    # service so that it doesn't get any more jobs.
    - name: Check NVIDIA driver installation step
      if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
      shell: bash
      env:
        RUNNER_WORKSPACE: ${{ runner.workspace }}
      run: |
        set +e
        set -x

        nvidia-smi
        # NB: Surprisingly, nvidia-smi returns successfully with return code 0 even when the
        # driver has already crashed, as it can still get the driver version and some basic
        # information like the bus ID. However, the rest of the information is missing
        # (ERR!), for example:
        #
        # +-----------------------------------------------------------------------------+
        # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
        # |-------------------------------+----------------------+----------------------+
        # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
        # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
        # |                               |                      |               MIG M. |
        # |===============================+======================+======================|
        # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
        # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
        # |                               |                      |                 ERR! |
        # +-------------------------------+----------------------+----------------------+
        #
        # +-----------------------------------------------------------------------------+
        # | Processes:                                                                  |
        # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
        # |        ID   ID                                                   Usage      |
        # |=============================================================================|
        # +-----------------------------------------------------------------------------+
        #
        # This should be reported as a failure instead, as it is guaranteed to fail when
        # Docker tries to run with --gpus all.
        #
        # So, the correct check here is to query one of the missing pieces of info, like the
        # GPU name, so that the command can fail accordingly.
        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
        NVIDIA_SMI_STATUS=$?

        # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # For runners with multiple GPUs, we also want to confirm that the number of GPUs is a
        # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails.
        # https://github.com/pytorch/test-infra/issues/4000
        # Use pipefail so the captured status reflects nvidia-smi rather than wc
        set -o pipefail
        GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
        NVIDIA_SMI_STATUS=$?
        set +o pipefail

        # These are the acceptable return codes from nvidia-smi, as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # Check that the GPU count is a power of 2
        if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
          echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
          .github/scripts/stop_runner_service.sh
        fi