# Composite action that prepares a self-hosted XPU runner for a CI job:
# prunes stopped containers, runs a series of runner health checks, takes the
# runner offline if any check failed, and exports the environment/flags that
# later docker invocations consume.
#
# NOTE: GitHub runs `shell: bash` steps with `bash -eo pipefail`, so any
# pipeline whose last stage is `grep` fails the step when nothing matches.
# Pipelines below that may legitimately match nothing end in `|| true`.
name: Setup XPU host

description: Set up XPU host for CI

runs:
  using: composite
  steps:
    - name: Clean all stopped docker containers
      if: always()
      shell: bash
      run: |
        # Prune all stopped containers.
        # If another runner is already pruning on this node, skip.
        # The "[d]ocker" bracket pattern keeps grep from matching its own
        # command line, so the count only reflects real prune processes and
        # does not depend on whether ps happens to observe grep.
        nprune=$(ps -ef | grep -c "[d]ocker container prune" || true)
        if [[ $nprune -eq 0 ]]; then
          docker container prune -f
        fi

    - name: Runner health check system info
      if: always()
      shell: bash
      run: |
        # Informational only: missing files must not fail the step.
        cat /etc/os-release || true
        cat /etc/apt/sources.list.d/oneAPI.list || true
        cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
        whoami

    - name: Runner health check xpu-smi
      if: always()
      shell: bash
      run: |
        xpu-smi discovery

    - name: Runner health check GPU count
      if: always()
      shell: bash
      run: |
        # grep -c exits 1 when the count is 0; `|| true` lets the explicit
        # check below run so the diagnostic message is actually printed.
        ngpu=$(xpu-smi discovery | grep -c -E 'Device Name' || true)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ $ngpu -eq 0 ]]; then
          echo "Error: Failed to detect any GPUs on the runner"
          echo "$msg"
          exit 1
        fi

    - name: Runner diskspace health check
      uses: ./.github/actions/diskspace-cleanup
      if: always()

    - name: Runner health check disconnect on failure
      if: ${{ failure() }}
      shell: bash
      run: |
        # A previous health check failed: stop the runner service script.
        killall runsvc.sh

    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        # Append GITHUB*/CI* variables to a per-run file under /tmp.
        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    - name: XPU set GPU_FLAG
      shell: bash
      run: |
        # Add render group for container creation.
        # Anchor on the group name so only the "render" entry is matched;
        # an unanchored match could return several lines and corrupt the
        # single-line GITHUB_ENV assignment below.
        render_gid=$(grep '^render:' /etc/group | cut -d: -f3 || true)
        if [[ -z "$render_gid" ]]; then
          echo "Error: Failed to find the gid of the 'render' group in /etc/group"
          exit 1
        fi
        echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"