# Composite action that prepares a ROCm CI runner: it points Docker at a
# per-user socket, clears leftover Docker state, runs a series of runner health
# checks, and exports environment values for later steps.
name: Setup ROCm host

description: Set up ROCm host for CI

runs:
  using: composite
  steps:
    # Export DOCKER_HOST to subsequent steps via GITHUB_ENV. The socket path is
    # derived from the current user's uid.
    # NOTE(review): this path layout looks like a rootless Docker socket - confirm.
    - name: Set DOCKER_HOST
      shell: bash
      run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"

    # Best-effort cleanup: continue-on-error keeps a failure here from failing
    # the job.
    - name: Remove leftover Docker config file
      shell: bash
      continue-on-error: true
      run: |
        set -ex

        cat ~/.docker/config.json || true
        # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not
        rm -f ~/.docker/config.json

    - name: Stop all running docker containers
      if: always()
      shell: bash
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all stopped containers.
        docker container prune -f

    # The health checks below use `if: always()` so that each one still runs
    # (and logs its diagnostics) even when an earlier step has failed.
    - name: Runner health check system info
      if: always()
      shell: bash
      run: |
        cat /etc/os-release || true
        cat /etc/apt/sources.list.d/rocm.list || true
        cat /opt/rocm/.info/version || true
        whoami

    - name: Runner health check rocm-smi
      if: always()
      shell: bash
      run: |
        rocm-smi

    - name: Runner health check rocminfo
      if: always()
      shell: bash
      run: |
        rocminfo

    # Count the rocminfo lines matching 'Name:.*\sgfx' and fail with an
    # actionable message when fewer than 2 are found.
    - name: Runner health check GPU count
      if: always()
      shell: bash
      run: |
        # `grep -c` prints 0 but exits with status 1 when nothing matches.
        # Steps with `shell: bash` run under `-e -o pipefail`, so without
        # `|| true` the script would abort on this assignment and the
        # zero-GPU diagnostic below would never be printed.
        ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx' || true)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ $ngpu -eq 0 ]]; then
          echo "Error: Failed to detect any GPUs on the runner"
          echo "$msg"
          exit 1
        fi
        if [[ $ngpu -eq 1 ]]; then
          echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
          echo "$msg"
          exit 1
        fi

    - name: Runner diskspace health check
      uses: ./.github/actions/diskspace-cleanup
      if: always()

    # Runs only when a previous step has failed.
    # NOTE(review): killing runsvc.sh presumably takes the unhealthy runner
    # offline so it stops picking up jobs - confirm against the runner setup.
    - name: Runner health check disconnect on failure
      if: ${{ failure() }}
      shell: bash
      run: |
        killall runsvc.sh

    # Append every GITHUB* and CI* environment variable to a per-run file
    # under /tmp, keyed by GITHUB_RUN_ID.
    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    - name: ROCm set GPU_FLAG
      shell: bash
      run: |
        # All GPUs are visible to the runner; visibility, if needed, will be set by run_test.py.
        echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"