name: linux-test

inputs:
  build-environment:
    required: true
    type: string
    description: Top-level label for what's being built/tested.
  test-matrix:
    required: true
    type: string
    description: JSON description of what test configs to run.
  docker-image:
    required: true
    type: string
    description: Docker image to run in.
  sync-tag:
    required: false
    type: string
    default: ""
    description: |
      If this is set, our linter will use this to make sure that every other
      job with the same `sync-tag` is identical.
  use-gha:
    required: false
    type: string
    default: ""
    description: If set to any value, upload to GHA. Otherwise upload to S3.
  dashboard-tag:
    required: false
    type: string
    default: ""
  s3-bucket:
    description: S3 bucket to download artifact
    required: false
    type: string
    default: "gha-artifacts"
  aws-role-to-assume:
    description: role to assume for downloading artifacts
    required: false
    type: string
    default: ""
  HUGGING_FACE_HUB_TOKEN:
    description: |
      HF Auth token to avoid rate limits when downloading models or datasets from hub
    required: false
    default: ""
  GITHUB_TOKEN:
    description: GitHub token
    required: true
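
# Illustrative only: the `test-matrix` input (and the calling workflow's matrix) is
# expected to look roughly like the JSON below; the steps reference fields such as
# `matrix.config`, `matrix.shard`, `matrix.num_shards`, and `matrix.runner`. The
# exact configs and runner labels shown here are hypothetical.
#
#   { "include": [
#       { "config": "default", "shard": 1, "num_shards": 2, "runner": "linux.4xlarge" },
#       { "config": "default", "shard": 2, "num_shards": 2, "runner": "linux.4xlarge" }
#   ]}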

#env:
#  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

runs:
  using: composite
  steps:
    - name: Setup Linux
      uses: ./.github/actions/setup-linux

    - name: configure aws credentials
      if: ${{ inputs.aws-role-to-assume != '' }}
      uses: aws-actions/configure-aws-credentials@v3
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
        aws-region: us-east-1

    - name: Calculate docker image
      id: calculate-docker-image
      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
      with:
        docker-image-name: ${{ inputs.docker-image }}

    - name: Use following to pull public copy of the image
      id: print-ghcr-mirror
      env:
        ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
      shell: bash
      run: |
        tag=${ECR_DOCKER_IMAGE##*/}
        echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
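
    # Illustrative only (hypothetical image name): for an ECR image such as
    #   <aws-account-id>.dkr.ecr.us-east-1.amazonaws.com/pytorch/linux-focal-cuda12.1-py3:<tag>
    # the step above prints
    #   docker pull ghcr.io/pytorch/ci-image:linux-focal-cuda12.1-py3-<tag>
    # i.e. everything after the last "/", with the first ":" replaced by "-".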

    - name: Pull docker image
      uses: pytorch/test-infra/.github/actions/pull-docker-image@main
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

    - name: Check if in an ARC runner
      shell: bash
      id: check_arc_runner
      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
      run: |
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
      if: contains(matrix.runner, 'a100')

    - name: Start monitoring script
      id: monitor-script
      shell: bash
      continue-on-error: true
      run: |
        python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
        python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
        echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

    - name: Download build artifacts
      uses: ./.github/actions/download-build-artifacts
      with:
        name: ${{ inputs.build-environment }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Download TD artifacts
      continue-on-error: true
      uses: ./.github/actions/download-td-artifacts

    - name: Parse ref
      id: parse-ref
      shell: bash
      run: .github/scripts/parse_ref.py

    - name: Get workflow job id
      id: get-job-id
      uses: ./.github/actions/get-workflow-job-id
      if: always()
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}

    - name: Check for keep-going label and re-enabled test issues
      # This uses the filter-test-configs action because it conveniently
      # checks for labels and re-enabled test issues. It does not actually do
      # any filtering. All filtering is done in the build step.
      id: keep-going
      uses: ./.github/actions/filter-test-configs
      with:
        github-token: ${{ inputs.GITHUB_TOKEN }}
        test-matrix: ${{ inputs.test-matrix }}
        job-name: ${{ steps.get-job-id.outputs.job-name }}

    - name: Test
      id: test
      env:
        BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        GITHUB_REPOSITORY: ${{ github.repository }}
        GITHUB_WORKFLOW: ${{ github.workflow }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_RUN_NUMBER: ${{ github.run_number }}
        GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
        JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
        BRANCH: ${{ steps.parse-ref.outputs.branch }}
        SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
        TEST_CONFIG: ${{ matrix.config }}
        SHARD_NUMBER: ${{ matrix.shard }}
        NUM_TEST_SHARDS: ${{ matrix.num_shards }}
        REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
        CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
        VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
        TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
        NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
        XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
        XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
        PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
      shell: bash
      run: |
        set -x

        if [[ $TEST_CONFIG == 'multigpu' ]]; then
          TEST_COMMAND=.ci/pytorch/multigpu-test.sh
        elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
          TEST_COMMAND=.ci/onnx/test.sh
        else
          TEST_COMMAND=.ci/pytorch/test.sh
        fi

        # detached container should get cleaned up by teardown_ec2_linux
        # TODO: Stop building test binaries as part of the build phase
        # Used for GPU_FLAG since that doesn't play nice
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
          -e GITHUB_REPOSITORY \
          -e GITHUB_WORKFLOW \
          -e GITHUB_JOB \
          -e GITHUB_RUN_ID \
          -e GITHUB_RUN_NUMBER \
          -e GITHUB_RUN_ATTEMPT \
          -e JOB_ID \
          -e JOB_NAME \
          -e BASE_SHA \
          -e BRANCH \
          -e SHA1 \
          -e AWS_DEFAULT_REGION \
          -e IN_WHEEL_TEST \
          -e SHARD_NUMBER \
          -e TEST_CONFIG \
          -e NUM_TEST_SHARDS \
          -e REENABLED_ISSUES \
          -e CONTINUE_THROUGH_ERROR \
          -e VERBOSE_TEST_LOGS \
          -e NO_TEST_TIMEOUT \
          -e NO_TD \
          -e TD_DISTRIBUTED \
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
          -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
          -e DASHBOARD_TAG \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
          --ipc=host \
          --shm-size="${SHM_SIZE}" \
          --tty \
          --detach \
          --name="${container_name}" \
          --user jenkins \
          -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
          -w /var/lib/jenkins/workspace \
          "${DOCKER_IMAGE}"
        )
        # Propagate download.pytorch.org IP to container
        grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
        echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
        docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
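
    # Illustrative only (hypothetical wheel name): the final `docker exec` above ends up
    # running something along the lines of
    #   pip install dist/torch-2.3.0-cp310-cp310-linux_x86_64.whl[opt-einsum] && .ci/pytorch/test.sh
    # inside the container, i.e. it installs the wheel downloaded from the build job and
    # then invokes the TEST_COMMAND selected at the top of the step.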

    - name: Upload pytest cache if tests failed
      uses: ./.github/actions/pytest-cache-upload
      continue-on-error: true
      if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
      with:
        cache_dir: .pytest_cache
        shard: ${{ matrix.shard }}
        sha: ${{ github.event.pull_request.head.sha || github.sha }}
        test_config: ${{ matrix.config }}
        job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

    - name: Print remaining test logs
      shell: bash
      if: always() && steps.test.conclusion
      run: |
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
      if: always() && steps.monitor-script.outputs.monitor-script-pid
      shell: bash
      continue-on-error: true
      env:
        MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
      run: |
        kill "$MONITOR_SCRIPT_PID"

    - name: Upload test artifacts
      uses: ./.github/actions/upload-test-artifacts
      if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
      with:
        file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        use-gha: ${{ inputs.use-gha }}
        s3-bucket: ${{ inputs.s3-bucket }}

    - name: Collect backtraces from coredumps (if any)
      if: always()
      shell: bash
      run: |
        # shellcheck disable=SC2156
        find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

    - name: Store Core dumps on S3
      uses: seemethere/upload-artifact-s3@v5
      if: failure()
      with:
        name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
        retention-days: 14
        if-no-files-found: ignore
        path: ./**/core.[1-9]*

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
      if: always()

    # NB: We are currently having an intermittent GPU-related issue on G5 runners with an
    # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
    # not seem to help. Here are some symptoms:
    #   * Calling nvidia-smi times out after 60 seconds
    #   * nvidia-smi fails with an "unable to determine the device handle for GPU:
    #     unknown error"
    #   * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
    #   * Running docker with --gpus all fails with an error response from the daemon
    #
    # As both the root cause and the recovery path are unclear, let's take the runner out
    # of service so that it doesn't get any more jobs
    - name: Check NVIDIA driver installation step
      if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
      shell: bash
      env:
        RUNNER_WORKSPACE: ${{ runner.workspace }}
      run: |
        set +e
        set -x

        nvidia-smi
        # NB: Surprisingly, nvidia-smi returns successfully with exit code 0 even when the
        # driver has already crashed, as it can still get the driver version and some basic
        # information like the bus ID. However, the rest of the information would be
        # missing (ERR!), for example:
        #
        # +-----------------------------------------------------------------------------+
        # | NVIDIA-SMI 525.89.02    Driver Version: 525.89.02    CUDA Version: 12.0     |
        # |-------------------------------+----------------------+----------------------+
        # | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
        # | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
        # |                               |                      |               MIG M. |
        # |===============================+======================+======================|
        # |   0  ERR!                 Off | 00000000:00:1E.0 Off |                 ERR! |
        # | ERR! ERR!  ERR!   ERR! / ERR! |   4184MiB / 23028MiB |    ERR!      Default |
        # |                               |                      |                 ERR! |
        # +-------------------------------+----------------------+----------------------+
        #
        # +-----------------------------------------------------------------------------+
        # | Processes:                                                                  |
        # |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
        # |        ID   ID                                                   Usage      |
        # |=============================================================================|
        # +-----------------------------------------------------------------------------+
        #
        # This should be reported as a failure instead, as it is guaranteed to fail when
        # Docker tries to run with --gpus all
        #
        # So, the correct check here is to query one of the missing pieces of info, like
        # the GPU name, so that the command can fail accordingly
        nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # For runners with multiple GPUs, we also want to confirm that the number of GPUs
        # is a power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one
        # GPU fails
        # https://github.com/pytorch/test-infra/issues/4000
        GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
        NVIDIA_SMI_STATUS=$?

        # These are acceptable return codes from nvidia-smi as copied from the setup-nvidia GitHub action
        if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
          echo "NVIDIA driver installation has failed, shutting down the runner..."
          .github/scripts/stop_runner_service.sh
        fi

        # Check that the GPU count is a power of 2
        if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
          echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
          .github/scripts/stop_runner_service.sh
        fi
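
        # (Aside, illustrative only, not used above): for positive counts, a compact
        # bitwise power-of-two test would be:
        #   [ $(( GPU_COUNT & (GPU_COUNT - 1) )) -eq 0 ]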