name: linux-test

inputs:
  build-environment:
    required: true
    type: string
    description: Top-level label for what's being built/tested.
  test-matrix:
    required: true
    type: string
    description: JSON description of what test configs to run.
  docker-image:
    required: true
    type: string
    description: Docker image to run in.
  sync-tag:
    required: false
    type: string
    default: ""
    description: |
      If this is set, our linter will use this to make sure that every other
      job with the same `sync-tag` is identical.
  use-gha:
    required: false
    type: string
    default: ""
    description: If set to any value, upload to GHA. Otherwise upload to S3.
  dashboard-tag:
    required: false
    type: string
    default: ""
  s3-bucket:
    description: S3 bucket to download artifact
    required: false
    type: string
    default: "gha-artifacts"
  aws-role-to-assume:
    description: Role to assume for downloading artifacts
    required: false
    type: string
    default: ""
  HUGGING_FACE_HUB_TOKEN:
    description: |
      HF Auth token to avoid rate limits when downloading models or datasets from hub
    required: false
    default: ""
  GITHUB_TOKEN:
    description: GitHub token
    required: true

#env:
#  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

runs:
  using: composite
  steps:
    - name: Setup Linux
      uses: ./.github/actions/setup-linux

    - name: configure aws credentials
      if: ${{ inputs.aws-role-to-assume != '' }}
      uses: aws-actions/configure-aws-credentials@v3
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
        aws-region: us-east-1

    - name: Calculate docker image
      id: calculate-docker-image
      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
      with:
        docker-image-name: ${{ inputs.docker-image }}

    - name: Use following to pull public copy of the image
      id: print-ghcr-mirror
      env:
        ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
      shell: bash
      run: |
        tag=${ECR_DOCKER_IMAGE##*/}
        echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
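    # For illustration only (hypothetical image name): if the resolved ECR image were
    #   <ecr-registry>/pytorch/pytorch-linux-focal-py3-clang10:abc123
    # then tag=pytorch-linux-focal-py3-clang10:abc123 and the step above would print
    #   docker pull ghcr.io/pytorch/ci-image:pytorch-linux-focal-py3-clang10-abc123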
    - name: Pull docker image
      uses: pytorch/test-infra/.github/actions/pull-docker-image@main
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

    - name: Check if in an ARC runner
      shell: bash
      id: check_arc_runner
      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
      run: |
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
      if: contains(matrix.runner, 'a100')

    - name: Start monitoring script
      id: monitor-script
      shell: bash
      continue-on-error: true
      run: |
        python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
        python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
        echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

    - name: Download build artifacts
      uses: ./.github/actions/download-build-artifacts
      with:
        name: ${{ inputs.build-environment }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Download TD artifacts
      continue-on-error: true
      uses: ./.github/actions/download-td-artifacts

    - name: Parse ref
      id: parse-ref
      shell: bash
      run: .github/scripts/parse_ref.py

    - name: Get workflow job id
      id: get-job-id
      uses: ./.github/actions/get-workflow-job-id
      if: always()
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}

    - name: Check for keep-going label and re-enabled test issues
      # This uses the filter-test-configs action because it conveniently
      # checks for labels and re-enabled test issues. It does not actually do
      # any filtering. All filtering is done in the build step.
      id: keep-going
      uses: ./.github/actions/filter-test-configs
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}
        test-matrix: ${{ inputs.test-matrix }}
        job-name: ${{ steps.get-job-id.outputs.job-name }}

    - name: Test
      id: test
      env:
        BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GITHUB_REPOSITORY: ${{ github.repository }}
        GITHUB_WORKFLOW: ${{ github.workflow }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_RUN_NUMBER: ${{ github.run_number }}
        GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
        JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        TEST_CONFIG: ${{ matrix.config }}
        SHARD_NUMBER: ${{ matrix.shard }}
        NUM_TEST_SHARDS: ${{ matrix.num_shards }}
        REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
        CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
        VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
        TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
        NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
        XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
      shell: bash
      run: |
        set -x

        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.ci/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.ci/onnx/test.sh
        else
          TEST_COMMAND=.ci/pytorch/test.sh
        fi

        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
          -e GITHUB_REPOSITORY \
          -e GITHUB_WORKFLOW \
          -e GITHUB_JOB \
          -e GITHUB_RUN_ID \
          -e GITHUB_RUN_NUMBER \
          -e GITHUB_RUN_ATTEMPT \
          -e JOB_ID \
          -e JOB_NAME \
          -e BASE_SHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e REENABLED_ISSUES \
          -e CONTINUE_THROUGH_ERROR \
          -e VERBOSE_TEST_LOGS \
          -e TEST_SHOWLOCALS \
          -e NO_TEST_TIMEOUT \
          -e NO_TD \
          -e TD_DISTRIBUTED \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
          -e DASHBOARD_TAG \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --name="${container_name}" \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        # Propagate download.pytorch.org IP to container
        grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
        echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
        docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
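    # For illustration only (hypothetical wheel name): with a single built wheel in dist/
    # and the default config, the docker exec above runs roughly
    #   pip install dist/torch-2.3.0-cp310-cp310-linux_x86_64.whl[opt-einsum] && .ci/pytorch/test.sh
    # inside the container, i.e. it installs the wheel from the build job (with the
    # opt-einsum extra) before invoking the selected test script.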
    - name: Upload pytest cache if tests failed
      uses: ./.github/actions/pytest-cache-upload
      continue-on-error: true
      if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
      with:
        cache_dir: .pytest_cache
        shard: ${{ matrix.shard }}
        sha: ${{ github.event.pull_request.head.sha || github.sha }}
        test_config: ${{ matrix.config }}
        job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

    - name: Print remaining test logs
      shell: bash
      if: always() && steps.test.conclusion
      run: |
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
      if: always() && steps.monitor-script.outputs.monitor-script-pid
      shell: bash
      continue-on-error: true
      env:
        MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
      run: |
        kill "$MONITOR_SCRIPT_PID"

    - name: Upload test artifacts
      uses: ./.github/actions/upload-test-artifacts
      if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
      with:
        file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        use-gha: ${{ inputs.use-gha }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Collect backtraces from coredumps (if any)
      if: always()
      shell: bash
      run: |
        # shellcheck disable=SC2156
        find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
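    # For illustration only (hypothetical core dump name): a dump at ./test/core.12345
    # would be passed to gdb inside the container roughly as
    #   docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python ./test/core.12345 -ex 'bt' -ex 'q'"
    # which prints a backtrace ('bt') and then quits ('q').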
    - name: Store Core dumps on S3
      uses: seemethere/upload-artifact-s3@v5
      if: failure()
      with:
        name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always()

    # NB: We are currently having an intermittent GPU-related issue on G5 runners with
    # A10G GPUs. Once this happens, trying to reset the GPU as done in setup-nvidia does
    # not seem to help. Here are some symptoms:
    #   * Calling nvidia-smi times out after 60 seconds
    #   * Running nvidia-smi fails with an "unable to determine the device handle for
    #     GPU unknown" error
    #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
    #   * Running docker --gpus all fails with an error response from the daemon
    #
    # As both the root cause and the recovery path are unclear, let's take the runner
    # out of service so that it doesn't get any more jobs
    - name: Check NVIDIA driver installation step
      if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
      shell: bash
      env:
        RUNNER_WORKSPACE: ${{ runner.workspace }}
      run: |
        set +e
        set -x

        nvidia-smi
        # NB: Surprisingly, the nvidia-smi command returns successfully with return code 0
        # even when the driver has already crashed, as it can still get the driver version
        # and some basic information like the bus ID. However, the rest of the information
        # would be missing (ERR!), for example:
        #
        # +-----------------------------------------------------------------------------+
        # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
        # |-------------------------------+----------------------+----------------------+
        # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
        # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
        # |                               |                      |               MIG M. |
        # |===============================+======================+======================|
        # |   0  ERR!                Off  | 00000000:00:1E.0 Off |                 ERR! |
        # |ERR!  ERR! ERR!    ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
        # |                               |                      |                 ERR! |
        # +-------------------------------+----------------------+----------------------+
        #
        # +-----------------------------------------------------------------------------+
        # | Processes:                                                                  |
        # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
        # |        ID   ID                                                   Usage      |
        # |=============================================================================|
        # +-----------------------------------------------------------------------------+
        #
        # This should be reported as a failure instead, as it is guaranteed to fail when
        # Docker tries to run with --gpus all
        #
        # So the correct check here is to query one of the missing pieces of info, such as
        # the GPU name, so that the command fails accordingly
        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # For runners with multiple GPUs, we also want to confirm that the number of GPUs is
        # a power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails
        # https://github.com/pytorch/test-infra/issues/4000
        GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # Check that the GPU count is a power of 2
        if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
          echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
          .github/scripts/stop_runner_service.sh
        fi
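        # For illustration: a hypothetical GPU_COUNT=3 (one GPU of four dropped off the bus)
        # takes the shutdown branch above, while counts of 1, 2, 4, or 8 pass the check.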