• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
# Composite action: runs runner health checks and prepares environment
# settings on an XPU CI host.
name: Setup XPU host
description: Set up XPU host for CI

runs:
  using: composite
  steps:
8    - name: Clean all stopped docker containers
9      if: always()
10      shell: bash
11      run: |
12        # Prune all stopped containers.
13        # If other runner is pruning on this node, will skip.
14        nprune=$(ps -ef | grep -c "docker container prune")
15        if [[ $nprune -eq 1 ]]; then
16          docker container prune -f
17        fi
18
19    - name: Runner health check system info
20      if: always()
21      shell: bash
22      run: |
23        cat /etc/os-release || true
24        cat /etc/apt/sources.list.d/oneAPI.list || true
25        cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
26        whoami
27
28    - name: Runner health check xpu-smi
29      if: always()
30      shell: bash
31      run: |
32        xpu-smi discovery
33
34    - name: Runner health check GPU count
35      if: always()
36      shell: bash
37      run: |
38        ngpu=$(xpu-smi discovery | grep -c -E 'Device Name')
39        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
40        if [[ $ngpu -eq 0 ]]; then
41          echo "Error: Failed to detect any GPUs on the runner"
42          echo "$msg"
43          exit 1
44        fi
45
46    - name: Runner diskspace health check
47      uses: ./.github/actions/diskspace-cleanup
48      if: always()
49
50    - name: Runner health check disconnect on failure
51      if: ${{ failure() }}
52      shell: bash
53      run: |
54        killall runsvc.sh
55
56    - name: Preserve github env variables for use in docker
57      shell: bash
58      run: |
59        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
60        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
61
62    - name: XPU set GPU_FLAG
63      shell: bash
64      run: |
65        # Add render group for container creation.
66        render_gid=`cat /etc/group | grep render | cut -d: -f3`
67        echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"
68