---
# Composite action. Its steps print information about the runner, make sure
# the docker daemon is available, log docker in to ECR, save the GITHUB*/CI*
# environment variables to a file, and clean up leftover docker state.
name: Setup Linux

description: Set up Docker workspace on EC2

runs:
  using: composite
  steps:
8    - name: Display EC2 information
9      shell: bash
10      run: |
11        set -euo pipefail
12        function get_ec2_metadata() {
13          # Pulled from instance metadata endpoint for EC2
14          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
15          category=$1
16          # If it is GCP runner (runner name contains gcp), do not run this
17          runner_name_str=${{ runner.name }}
18          if [[ -f /.inarc ]]; then
19            echo "ARC Runner, no info on ec2 metadata"
20          elif [[ $runner_name_str == *"gcp"* ]]; then
21            echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
22          else
23            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
24          fi
25        }
26        echo "ami-id: $(get_ec2_metadata ami-id)"
27        echo "instance-id: $(get_ec2_metadata instance-id)"
28        echo "instance-type: $(get_ec2_metadata instance-type)"
29        echo "system info $(uname -a)"
30
31    - name: Check if in a ARC runner
32      shell: bash
33      id: check_arc_runner
34      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)"  >> $GITHUB_OUTPUT
35
36    - name: Start docker if docker deamon is not running
37      shell: bash
38      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
39      run: |
40        if systemctl is-active --quiet docker; then
41            echo "Docker daemon is running...";
42        else
43            echo "Starting docker deamon..." && sudo systemctl start docker;
44        fi
45
46    - name: Log in to ECR
47      uses: nick-fields/retry@v3.0.0
48      env:
49        AWS_RETRY_MODE: standard
50        AWS_MAX_ATTEMPTS: "5"
51        AWS_DEFAULT_REGION: us-east-1
52      with:
53        shell: bash
54        timeout_minutes: 5
55        max_attempts: 3
56        retry_wait_seconds: 30
57        command: |
58          AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
59          aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
60              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
61
62          # For LF Runners we need to make sure we also login to Meta's ECR docker registry too.
63          META_AWS_ACCOUNT_ID=308535385114
64          if [ "$AWS_ACCOUNT_ID" != "$META_AWS_ACCOUNT_ID" ] ; then
65              aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
66                  --password-stdin "$META_AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
67          fi
68
69    - name: Preserve github env variables for use in docker
70      shell: bash
71      run: |
72        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
73        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
74
75    - name: Kill any existing containers, clean up images
76      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
77      shell: bash
78      run: |
79        # ignore expansion of "docker ps -q" since it could be empty
80        # shellcheck disable=SC2046
81        docker stop $(docker ps -q) || true
82        # Prune all of the docker images
83        docker system prune -af
84
85    - name: Manually resolve download.pytorch.org
86      shell: bash
87      continue-on-error: true
88      run: |
89        set +e
90        set -x
91
92        PT_DOMAIN=download.pytorch.org
93        # TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
94        # cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
95        # one is returned at random
96        RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
97
98        if [ -z "${RESOLVED_IP}" ]; then
99          echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
100          RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
101
102          if [ -z "${RESOLVED_IP}" ]; then
103            echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
104            exit 1
105          fi
106        fi
107
108        if grep -r "${PT_DOMAIN}" /etc/hosts; then
109          # Clean up any old records first
110          sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
111        fi
112
113        echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
114        cat /etc/hosts
115
116    - name: Check that the docker daemon is running
117      shell: bash
118      continue-on-error: true
119      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
120      run: |
121        set +x
122
123        max_attempts=30
124        delay=10
125        attempt=1
126
127        for attempt in $(seq 1 $max_attempts); do
128          echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
129          if docker info > /dev/null 2>&1; then
130            echo "Docker is running. Proceeding with the next steps"
131            exit 0
132          else
133            echo "Docker is not running yet."
134            echo "Retrying in $delay seconds..."
135            sleep $delay
136          fi
137        done
138        echo "Reached maximum attempts to connect to Docker. Exiting."
139        exit 1
140