1name: linux-test 2 3on: 4 workflow_call: 5 inputs: 6 build-environment: 7 required: true 8 type: string 9 description: Top-level label for what's being built/tested. 10 test-matrix: 11 required: true 12 type: string 13 description: JSON description of what test configs to run. 14 docker-image: 15 required: true 16 type: string 17 description: Docker image to run in. 18 sync-tag: 19 required: false 20 type: string 21 default: "" 22 description: | 23 If this is set, our linter will use this to make sure that every other 24 job with the same `sync-tag` is identical. 25 timeout-minutes: 26 required: false 27 type: number 28 default: 240 29 description: | 30 Set the maximum (in minutes) how long the workflow should take to finish 31 use-gha: 32 required: false 33 type: string 34 default: "" 35 description: If set to any value, upload to GHA. Otherwise upload to S3. 36 dashboard-tag: 37 required: false 38 type: string 39 default: "" 40 s3-bucket: 41 description: S3 bucket to download artifact 42 required: false 43 type: string 44 default: "gha-artifacts" 45 aws-role-to-assume: 46 description: role to assume for downloading artifacts 47 required: false 48 type: string 49 default: "" 50 secrets: 51 HUGGING_FACE_HUB_TOKEN: 52 required: false 53 description: | 54 HF Auth token to avoid rate limits when downloading models or datasets from hub 55 56env: 57 GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} 58 59jobs: 60 test: 61 # Don't run on forked repos or empty test matrix 62 if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' 63 strategy: 64 matrix: ${{ fromJSON(inputs.test-matrix) }} 65 fail-fast: false 66 runs-on: ${{ matrix.runner }} 67 timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} 68 steps: 69 - name: Setup SSH (Click me for login details) 70 uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.4 71 if: ${{ !contains(matrix.runner, 'gcp.a100') }} 72 with: 73 github-secret: ${{ secrets.GITHUB_TOKEN }} 74 instructions: | 75 All testing is done inside the container, to start an interactive session run: 76 docker exec -it $(docker container ps --format '{{.ID}}') bash 77 78 - name: Checkout PyTorch 79 uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.4 80 81 - name: Setup Linux 82 uses: ./.github/actions/setup-linux 83 84 - name: configure aws credentials 85 if : ${{ inputs.aws-role-to-assume != '' }} 86 uses: aws-actions/configure-aws-credentials@v3 87 with: 88 role-to-assume: ${{ inputs.aws-role-to-assume }} 89 role-session-name: gha-linux-test 90 aws-region: us-east-1 91 92 - name: Calculate docker image 93 id: calculate-docker-image 94 uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.4 95 with: 96 docker-image-name: ${{ inputs.docker-image }} 97 98 - name: Use following to pull public copy of the image 99 id: print-ghcr-mirror 100 env: 101 ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} 102 shell: bash 103 run: | 104 tag=${ECR_DOCKER_IMAGE##*/} 105 echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" 106 107 - name: Pull docker image 108 uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.4 109 with: 110 docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} 111 112 - name: Check if in a ARC runner 113 shell: bash 114 id: check_arc_runner 115 run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" 116 117 - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG 118 id: install-nvidia-driver 119 uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.4 120 if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} 121 122 - name: Lock NVIDIA A100 40GB Frequency 123 run: | 124 sudo nvidia-smi -pm 1 125 sudo nvidia-smi -ac 1215,1410 126 nvidia-smi 127 if: contains(matrix.runner, 'a100') 128 129 - name: Start monitoring script 130 id: monitor-script 131 shell: bash 132 continue-on-error: true 133 run: | 134 python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 135 python3 -m tools.stats.monitor > usage_log.txt 2>&1 & 136 echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" 137 138 - name: Download build artifacts 139 uses: ./.github/actions/download-build-artifacts 140 with: 141 name: ${{ inputs.build-environment }} 142 s3-bucket: ${{ inputs.s3-bucket }} 143 144 - name: Download TD artifacts 145 continue-on-error: true 146 uses: ./.github/actions/download-td-artifacts 147 148 - name: Parse ref 149 id: parse-ref 150 run: .github/scripts/parse_ref.py 151 152 - name: Get workflow job id 153 id: get-job-id 154 uses: ./.github/actions/get-workflow-job-id 155 if: always() 156 with: 157 github-token: ${{ secrets.GITHUB_TOKEN }} 158 159 - name: Check for keep-going label and re-enabled test issues 160 # This uses the filter-test-configs action because it conviniently 161 # checks for labels and re-enabled test issues. It does not actually do 162 # any filtering. All filtering is done in the build step. 163 id: keep-going 164 uses: ./.github/actions/filter-test-configs 165 with: 166 github-token: ${{ secrets.GITHUB_TOKEN }} 167 test-matrix: ${{ inputs.test-matrix }} 168 job-name: ${{ steps.get-job-id.outputs.job-name }} 169 170 - name: Set Test step time 171 id: test-timeout 172 shell: bash 173 env: 174 JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} 175 run: | 176 echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" 177 178 - name: Test 179 id: test 180 timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} 181 env: 182 BUILD_ENVIRONMENT: ${{ inputs.build-environment }} 183 PR_NUMBER: ${{ github.event.pull_request.number }} 184 GITHUB_REPOSITORY: ${{ github.repository }} 185 GITHUB_WORKFLOW: ${{ github.workflow }} 186 GITHUB_JOB: ${{ github.job }} 187 GITHUB_RUN_ID: ${{ github.run_id }} 188 GITHUB_RUN_NUMBER: ${{ github.run_number }} 189 GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} 190 JOB_ID: ${{ steps.get-job-id.outputs.job-id }} 191 JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} 192 BRANCH: ${{ steps.parse-ref.outputs.branch }} 193 SHA1: ${{ github.event.pull_request.head.sha || github.sha }} 194 BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} 195 TEST_CONFIG: ${{ matrix.config }} 196 SHARD_NUMBER: ${{ matrix.shard }} 197 NUM_TEST_SHARDS: ${{ matrix.num_shards }} 198 REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} 199 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} 200 VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} 201 NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} 202 NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} 203 TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} 204 SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 205 SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} 206 SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} 207 DOCKER_IMAGE: ${{ inputs.docker-image }} 208 XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} 209 XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla 210 PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} 211 PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} 212 DASHBOARD_TAG: ${{ inputs.dashboard-tag }} 213 HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} 214 215 run: | 216 set -x 217 218 if [[ $TEST_CONFIG == 'multigpu' ]]; then 219 TEST_COMMAND=.ci/pytorch/multigpu-test.sh 220 elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then 221 TEST_COMMAND=.ci/onnx/test.sh 222 else 223 TEST_COMMAND=.ci/pytorch/test.sh 224 fi 225 226 # detached container should get cleaned up by teardown_ec2_linux 227 # TODO: Stop building test binaries as part of the build phase 228 # Used for GPU_FLAG since that doesn't play nice 229 # shellcheck disable=SC2086,SC2090 230 container_name=$(docker run \ 231 ${GPU_FLAG:-} \ 232 -e BUILD_ENVIRONMENT \ 233 -e PR_NUMBER \ 234 -e GITHUB_ACTIONS \ 235 -e GITHUB_REPOSITORY \ 236 -e GITHUB_WORKFLOW \ 237 -e GITHUB_JOB \ 238 -e GITHUB_RUN_ID \ 239 -e GITHUB_RUN_NUMBER \ 240 -e GITHUB_RUN_ATTEMPT \ 241 -e JOB_ID \ 242 -e JOB_NAME \ 243 -e BASE_SHA \ 244 -e BRANCH \ 245 -e SHA1 \ 246 -e AWS_DEFAULT_REGION \ 247 -e IN_WHEEL_TEST \ 248 -e SHARD_NUMBER \ 249 -e TEST_CONFIG \ 250 -e NUM_TEST_SHARDS \ 251 -e REENABLED_ISSUES \ 252 -e CONTINUE_THROUGH_ERROR \ 253 -e VERBOSE_TEST_LOGS \ 254 -e NO_TEST_TIMEOUT \ 255 -e NO_TD \ 256 -e TD_DISTRIBUTED \ 257 -e PR_LABELS \ 258 -e MAX_JOBS="$(nproc --ignore=2)" \ 259 -e SCCACHE_BUCKET \ 260 -e SCCACHE_S3_KEY_PREFIX \ 261 -e XLA_CUDA \ 262 -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ 263 -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ 264 -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ 265 -e SKIP_SCCACHE_INITIALIZATION=1 \ 266 -e HUGGING_FACE_HUB_TOKEN \ 267 -e DASHBOARD_TAG \ 268 --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ 269 --security-opt seccomp=unconfined \ 270 --cap-add=SYS_PTRACE \ 271 --ipc=host \ 272 --shm-size="${SHM_SIZE}" \ 273 --tty \ 274 --detach \ 275 --name="${container_name}" \ 276 --user jenkins \ 277 -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ 278 -w /var/lib/jenkins/workspace \ 279 "${DOCKER_IMAGE}" 280 ) 281 # Propagate download.pytorch.org IP to container 282 grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" 283 echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" 284 docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" 285 286 - name: Upload pytest cache if tests failed 287 uses: ./.github/actions/pytest-cache-upload 288 continue-on-error: true 289 if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' 290 with: 291 cache_dir: .pytest_cache 292 shard: ${{ matrix.shard }} 293 sha: ${{ github.event.pull_request.head.sha || github.sha }} 294 test_config: ${{ matrix.config }} 295 job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} 296 297 - name: Print remaining test logs 298 shell: bash 299 if: always() && steps.test.conclusion 300 run: | 301 cat test/**/*_toprint.log || true 302 303 - name: Stop monitoring script 304 if: always() && steps.monitor-script.outputs.monitor-script-pid 305 shell: bash 306 continue-on-error: true 307 env: 308 MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} 309 run: | 310 kill "$MONITOR_SCRIPT_PID" 311 312 - name: Upload test artifacts 313 uses: ./.github/actions/upload-test-artifacts 314 if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' 315 with: 316 file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} 317 use-gha: ${{ inputs.use-gha }} 318 s3-bucket: ${{ inputs.s3-bucket }} 319 320 - name: Collect backtraces from coredumps (if any) 321 if: always() 322 run: | 323 # shellcheck disable=SC2156 324 find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; 325 326 - name: Store Core dumps on S3 327 uses: seemethere/upload-artifact-s3@v5 328 if: failure() 329 with: 330 name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} 331 retention-days: 14 332 if-no-files-found: ignore 333 path: ./**/core.[1-9]* 334 335 - name: Teardown Linux 336 uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.4 337 if: always() 338 339 # NB: We are currently having an intermittent GPU-related issue on G5 runners with 340 # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does 341 # not seem to help. Here are some symptoms: 342 # * Calling nvidia-smi timeouts after 60 second 343 # * Fail to run nvidia-smi with an unable to determine the device handle for GPU 344 # unknown error 345 # * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch 346 # * Run docker --gpus all fails with error response from daemon 347 # 348 # As both the root cause and recovery path are unclear, let's take the runner out of 349 # service so that it doesn't get any more jobs 350 - name: Check NVIDIA driver installation step 351 if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' 352 shell: bash 353 env: 354 RUNNER_WORKSPACE: ${{ runner.workspace }} 355 run: | 356 set +e 357 set -x 358 359 nvidia-smi 360 # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in 361 # the case where the driver has already crashed as it still can get the driver version 362 # and some basic information like the bus ID. However, the rest of the information 363 # would be missing (ERR!), for example: 364 # 365 # +-----------------------------------------------------------------------------+ 366 # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 | 367 # |-------------------------------+----------------------+----------------------+ 368 # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 369 # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 370 # | | | MIG M. | 371 # |===============================+======================+======================| 372 # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! | 373 # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default | 374 # | | | ERR! | 375 # +-------------------------------+----------------------+----------------------+ 376 # 377 # +-----------------------------------------------------------------------------+ 378 # | Processes: | 379 # | GPU GI CI PID Type Process name GPU Memory | 380 # | ID ID Usage | 381 # |=============================================================================| 382 # +-----------------------------------------------------------------------------+ 383 # 384 # This should be reported as a failure instead as it will guarantee to fail when 385 # Docker tries to run with --gpus all 386 # 387 # So, the correct check here is to query one of the missing piece of info like 388 # GPU name, so that the command can fail accordingly 389 nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 390 NVIDIA_SMI_STATUS=$? 391 392 # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action 393 if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then 394 echo "NVIDIA driver installation has failed, shutting down the runner..." 395 .github/scripts/stop_runner_service.sh 396 fi 397 398 # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the 399 # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails 400 # https://github.com/pytorch/test-infra/issues/4000 401 GPU_COUNT=$(nvidia-smi --list-gpus | wc -l) 402 NVIDIA_SMI_STATUS=$? 403 404 # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action 405 if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then 406 echo "NVIDIA driver installation has failed, shutting down the runner..." 407 .github/scripts/stop_runner_service.sh 408 fi 409 410 # Check the GPU count to be a power of 2 411 if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then 412 echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..." 413 .github/scripts/stop_runner_service.sh 414 fi 415