name: linux-test

inputs:
  build-environment:
    required: true
    type: string
    description: Top-level label for what's being built/tested.
  test-matrix:
    required: true
    type: string
    description: JSON description of what test configs to run.
  docker-image:
    required: true
    type: string
    description: Docker image to run in.
  sync-tag:
    required: false
    type: string
    default: ""
    description: |
      If this is set, our linter will use this to make sure that every other
      job with the same `sync-tag` is identical.
  use-gha:
    required: false
    type: string
    default: ""
    description: If set to any value, upload to GHA. Otherwise upload to S3.
  dashboard-tag:
    required: false
    type: string
    default: ""
  s3-bucket:
    description: S3 bucket to download artifact
    required: false
    type: string
    default: "gha-artifacts"
  aws-role-to-assume:
    description: role to assume for downloading artifacts
    required: false
    type: string
    default: ""
  HUGGING_FACE_HUB_TOKEN:
    description: |
      HF Auth token to avoid rate limits when downloading models or datasets from hub
    required: false
    default: ""
  GITHUB_TOKEN:
    description: GitHub token
    required: true
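
# Illustrative only: the `test-matrix` input (and the calling workflow's matrix) is
# expected to look roughly like the JSON below; the steps reference fields such as
# `matrix.config`, `matrix.shard`, `matrix.num_shards`, and `matrix.runner`. The
# exact configs and runner labels shown here are hypothetical.
#
#   { "include": [
#       { "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.4xlarge" },
#       { "config": "default", "shard": 2, "num_shards": 2, "runner": "linux.4xlarge" }
#   ]}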

#env:
#  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

runs:
  using: composite
  steps:
    - name: Setup Linux
      uses: ./.github/actions/setup-linux

    - name: configure aws credentials
      if: ${{ inputs.aws-role-to-assume != '' }}
      uses: aws-actions/configure-aws-credentials@v3
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
        aws-region: us-east-1

    - name: Calculate docker image
      id: calculate-docker-image
      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
      with:
        docker-image-name: ${{ inputs.docker-image }}

    - name: Use following to pull public copy of the image
      id: print-ghcr-mirror
      env:
        ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
      shell: bash
      run: |
        tag=${ECR_DOCKER_IMAGE##*/}
        echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
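
    # Illustrative only (hypothetical image name): for an ECR image such as
    #   <aws-account-id>.dkr.ecr.us-east-1.amazonaws.com/pytorch/linux-focal-cuda12.1-py3:<tag>
    # the step above prints
    #   docker pull ghcr.io/pytorch/ci-image:linux-focal-cuda12.1-py3-<tag>
    # i.e. everything after the last "/", with the first ":" replaced by "-".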

    - name: Pull docker image
      uses: pytorch/test-infra/.github/actions/pull-docker-image@main
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

    - name: Check if in an ARC runner
      shell: bash
      id: check_arc_runner
      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
      run: |
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
      if: contains(matrix.runner, 'a100')

    - name: Start monitoring script
      id: monitor-script
      shell: bash
      continue-on-error: true
      run: |
        python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
        python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
        echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

    - name: Download build artifacts
      uses: ./.github/actions/download-build-artifacts
      with:
        name: ${{ inputs.build-environment }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Download TD artifacts
      continue-on-error: true
      uses: ./.github/actions/download-td-artifacts

    - name: Parse ref
      id: parse-ref
      shell: bash
      run: .github/scripts/parse_ref.py

    - name: Get workflow job id
      id: get-job-id
      uses: ./.github/actions/get-workflow-job-id
      if: always()
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}

    - name: Check for keep-going label and re-enabled test issues
      # This uses the filter-test-configs action because it conveniently
      # checks for labels and re-enabled test issues. It does not actually do
      # any filtering. All filtering is done in the build step.
      id: keep-going
      uses: ./.github/actions/filter-test-configs
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}
        test-matrix: ${{ inputs.test-matrix }}
        job-name: ${{ steps.get-job-id.outputs.job-name }}

    - name: Test
      id: test
      env:
        BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GITHUB_REPOSITORY: ${{ github.repository }}
        GITHUB_WORKFLOW: ${{ github.workflow }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_RUN_NUMBER: ${{ github.run_number }}
        GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
        JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        TEST_CONFIG: ${{ matrix.config }}
        SHARD_NUMBER: ${{ matrix.shard }}
        NUM_TEST_SHARDS: ${{ matrix.num_shards }}
        REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
        CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
        VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
        TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
        NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
        XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
      shell: bash
      run: |
        set -x

        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.ci/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.ci/onnx/test.sh
        else
          TEST_COMMAND=.ci/pytorch/test.sh
        fi

        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
          -e GITHUB_REPOSITORY \
          -e GITHUB_WORKFLOW \
          -e GITHUB_JOB \
          -e GITHUB_RUN_ID \
          -e GITHUB_RUN_NUMBER \
          -e GITHUB_RUN_ATTEMPT \
          -e JOB_ID \
          -e JOB_NAME \
          -e BASE_SHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e REENABLED_ISSUES \
          -e CONTINUE_THROUGH_ERROR \
          -e VERBOSE_TEST_LOGS \
          -e NO_TEST_TIMEOUT \
          -e NO_TD \
          -e TD_DISTRIBUTED \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
          -e DASHBOARD_TAG \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --name="${container_name}" \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        # Propagate download.pytorch.org IP to container
        grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
        echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
        docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
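
    # Illustrative only (hypothetical wheel name): the final `docker exec` above ends up
    # running something along the lines of
    #   pip install dist/torch-2.3.0-cp310-cp310-linux_x86_64.whl[opt-einsum] && .ci/pytorch/test.sh
    # inside the container, i.e. it installs the wheel downloaded from the build job and
    # then invokes the TEST_COMMAND selected at the top of the step.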

    - name: Upload pytest cache if tests failed
      uses: ./.github/actions/pytest-cache-upload
      continue-on-error: true
      if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
      with:
        cache_dir: .pytest_cache
        shard: ${{ matrix.shard }}
        sha: ${{ github.event.pull_request.head.sha || github.sha }}
        test_config: ${{ matrix.config }}
        job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

    - name: Print remaining test logs
      shell: bash
      if: always() && steps.test.conclusion
      run: |
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
      if: always() && steps.monitor-script.outputs.monitor-script-pid
      shell: bash
      continue-on-error: true
      env:
        MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
      run: |
        kill "$MONITOR_SCRIPT_PID"

    - name: Upload test artifacts
      uses: ./.github/actions/upload-test-artifacts
      if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
      with:
        file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        use-gha: ${{ inputs.use-gha }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Collect backtraces from coredumps (if any)
      if: always()
      shell: bash
      run: |
        # shellcheck disable=SC2156
        find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

    - name: Store Core dumps on S3
      uses: seemethere/upload-artifact-s3@v5
      if: failure()
      with:
        name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always()

    # NB: We are currently having an intermittent GPU-related issue on G5 runners with an
    # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
    # not seem to help. Here are some symptoms:
    #   * Calling nvidia-smi times out after 60 seconds
    #   * nvidia-smi fails with an "unable to determine the device handle for GPU:
    #     unknown error"
    #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
    #   * Running docker with --gpus all fails with an error response from the daemon
    #
    # As both the root cause and the recovery path are unclear, let's take the runner out
    # of service so that it doesn't get any more jobs
    - name: Check NVIDIA driver installation step
      if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
      shell: bash
      env:
        RUNNER_WORKSPACE: ${{ runner.workspace }}
      run: |
        set +e
        set -x

        nvidia-smi
        # NB: Surprisingly, nvidia-smi returns successfully with exit code 0 even when the
        # driver has already crashed, as it can still get the driver version and some basic
        # information like the bus ID. However, the rest of the information would be
        # missing (ERR!), for example:
        #
        # +-----------------------------------------------------------------------------+
        # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
        # |-------------------------------+----------------------+----------------------+
        # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
        # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
        # |                               |                      |               MIG M. |
        # |===============================+======================+======================|
        # |   0  ERR!                 Off | 00000000:00:1E.0 Off |                 ERR! |
        # | ERR! ERR!  ERR!   ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
        # |                               |                      |                 ERR! |
        # +-------------------------------+----------------------+----------------------+
        #
        # +-----------------------------------------------------------------------------+
        # | Processes:                                                                  |
        # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
        # |        ID   ID                                                   Usage      |
        # |=============================================================================|
        # +-----------------------------------------------------------------------------+
        #
        # This should be reported as a failure instead, as it is guaranteed to fail when
        # Docker tries to run with --gpus all
        #
        # So, the correct check here is to query one of the missing pieces of info, like
        # the GPU name, so that the command can fail accordingly
        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # For runners with multiple GPUs, we also want to confirm that the number of GPUs
        # is a power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one
        # GPU fails
        # https://github.com/pytorch/test-infra/issues/4000
        GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # Check that the GPU count is a power of 2
        if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
          echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
          .github/scripts/stop_runner_service.sh
        fi
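
        # (Aside, illustrative only, not used above): for positive counts, a compact
        # bitwise power-of-two test would be:
        #   [ $(( GPU_COUNT & (GPU_COUNT - 1) )) -eq 0 ]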