# Composite action that prepares a self-hosted XPU runner for a CI job:
# prunes stopped docker containers, runs a series of runner health checks,
# and exports the environment that later docker invocations rely on.
name: Setup XPU host

description: Set up XPU host for CI

runs:
  using: composite
  steps:
    - name: Clean all stopped docker containers
      if: always()
      shell: bash
      run: |
        # Prune all stopped containers.
        # If another runner is already pruning on this node, skip.
        # The grep process itself shows up in the `ps` listing and matches
        # the pattern, so a count of exactly 1 means no other prune is
        # currently running.
        nprune=$(ps -ef | grep -c "docker container prune")
        if [[ $nprune -eq 1 ]]; then
          docker container prune -f
        fi

    - name: Runner health check system info
      if: always()
      shell: bash
      run: |
        # Purely informational dump for the job log; the files may be
        # absent on some hosts, so failures to read them are ignored.
        cat /etc/os-release || true
        cat /etc/apt/sources.list.d/oneAPI.list || true
        cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
        whoami

    - name: Runner health check xpu-smi
      if: always()
      shell: bash
      run: |
        # Fails the step if xpu-smi is missing or exits non-zero.
        xpu-smi discovery

    - name: Runner health check GPU count
      if: always()
      shell: bash
      run: |
        # `shell: bash` runs with `-e -o pipefail`, and `grep -c` exits 1
        # when it counts zero matches. Without `|| true` the assignment
        # itself would abort the step before the message below is printed.
        ngpu=$(xpu-smi discovery | grep -c -E 'Device Name' || true)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ $ngpu -eq 0 ]]; then
          echo "Error: Failed to detect any GPUs on the runner"
          echo "$msg"
          exit 1
        fi

    - name: Runner diskspace health check
      uses: ./.github/actions/diskspace-cleanup
      if: always()

    - name: Runner health check disconnect on failure
      if: ${{ failure() }}
      shell: bash
      run: |
        # Only runs when an earlier step failed. Kills runsvc.sh
        # (presumably the runner's service wrapper -- confirm) so the
        # unhealthy host stops taking jobs.
        killall runsvc.sh

    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        # Snapshot GITHUB* and CI* variables into a per-run file under /tmp.
        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    - name: XPU set GPU_FLAG
      shell: bash
      run: |
        # Add render group for container creation.
        # Match only the `render` group entry itself; an unanchored grep
        # would also pick up any /etc/group line that merely contains the
        # word (other group names, member user names) and could produce a
        # multi-line value that corrupts the GITHUB_ENV entry.
        # If the group does not exist the pipeline fails and the step
        # aborts instead of exporting an empty --group-add argument.
        render_gid=$(grep '^render:' /etc/group | cut -d: -f3)
        echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"