# Composite GitHub Action: set up a ROCm host for CI.
---
# Action metadata shown in the GitHub UI.
name: Setup ROCm host

description: Set up ROCm host for CI

# Composite action: the steps below run inline in the calling job.
runs:
  using: composite
  steps:
8    - name: Set DOCKER_HOST
9      shell: bash
10      run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
11
12    - name: Remove leftover Docker config file
13      shell: bash
14      continue-on-error: true
15      run: |
16        set -ex
17
18        cat ~/.docker/config.json || true
19        # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not
20        rm -f ~/.docker/config.json
21
22    - name: Stop all running docker containers
23      if: always()
24      shell: bash
25      run: |
26        # ignore expansion of "docker ps -q" since it could be empty
27        # shellcheck disable=SC2046
28        docker stop $(docker ps -q) || true
29        # Prune all stopped containers.
30        docker container prune -f
31
32    - name: Runner health check system info
33      if: always()
34      shell: bash
35      run: |
36        cat /etc/os-release || true
37        cat /etc/apt/sources.list.d/rocm.list || true
38        cat /opt/rocm/.info/version || true
39        whoami
40
41    - name: Runner health check rocm-smi
42      if: always()
43      shell: bash
44      run: |
45        rocm-smi
46
47    - name: Runner health check rocminfo
48      if: always()
49      shell: bash
50      run: |
51        rocminfo
52
53    - name: Runner health check GPU count
54      if: always()
55      shell: bash
56      run: |
57        ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
58        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
59        if [[ $ngpu -eq 0 ]]; then
60            echo "Error: Failed to detect any GPUs on the runner"
61            echo "$msg"
62            exit 1
63        fi
64        if [[ $ngpu -eq 1 ]]; then
65            echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
66            echo "$msg"
67            exit 1
68        fi
69
70    - name: Runner diskspace health check
71      uses: ./.github/actions/diskspace-cleanup
72      if: always()
73
74    - name: Runner health check disconnect on failure
75      if: ${{ failure() }}
76      shell: bash
77      run: |
78        killall runsvc.sh
79
80    - name: Preserve github env variables for use in docker
81      shell: bash
82      run: |
83        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
84        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
85
86    - name: ROCm set GPU_FLAG
87      shell: bash
88      run: |
89        # All GPUs are visible to the runner; visibility, if needed, will be set by run_test.py.
90        echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
91