# Composite action that prepares a Linux CI runner for Docker-based jobs:
# prints runner information, makes sure the Docker daemon is usable, logs in
# to ECR, snapshots the GitHub/CI environment variables for later use inside
# containers, cleans up leftover containers/images, and pins
# download.pytorch.org in /etc/hosts.
name: Setup Linux

description: Set up Docker workspace on EC2

runs:
  using: composite
  steps:
    - name: Display EC2 information
      shell: bash
      env:
        # Pass the runner name through the environment instead of pasting the
        # expression into the script body, so that names containing spaces or
        # shell metacharacters cannot break (or inject into) the script.
        RUNNER_NAME_STR: ${{ runner.name }}
      run: |
        set -euo pipefail
        # Print one EC2 instance metadata category (e.g. ami-id).
        # On ARC runners (marked by /.inarc) and on GCP runners (runner name
        # contains "gcp") the metadata endpoint is not queried and an
        # explanatory message is printed instead.
        function get_ec2_metadata() {
          # Pulled from instance metadata endpoint for EC2
          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
          local category="$1"
          if [[ -f /.inarc ]]; then
            echo "ARC Runner, no info on ec2 metadata"
          elif [[ "${RUNNER_NAME_STR}" == *"gcp"* ]]; then
            echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
          else
            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          fi
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
        echo "instance-id: $(get_ec2_metadata instance-id)"
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"

    # Exposes IN_ARC_RUNNER=true|false as a step output; later steps branch on it.
    - name: Check if in a ARC runner
      shell: bash
      id: check_arc_runner
      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"

    - name: Start docker if docker deamon is not running
      shell: bash
      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
      run: |
        if systemctl is-active --quiet docker; then
          echo "Docker daemon is running...";
        else
          echo "Starting docker deamon..." && sudo systemctl start docker;
        fi

    - name: Log in to ECR
      uses: nick-fields/retry@v3.0.0
      env:
        AWS_RETRY_MODE: standard
        AWS_MAX_ATTEMPTS: "5"
        AWS_DEFAULT_REGION: us-east-1
      with:
        shell: bash
        timeout_minutes: 5
        max_attempts: 3
        retry_wait_seconds: 30
        command: |
          # Ask the CLI for the account id directly rather than scraping the
          # JSON output, so the result does not depend on the output format.
          AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
          aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
              --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"

          # For LF Runners we need to make sure we also login to Meta's ECR docker registry too.
          META_AWS_ACCOUNT_ID=308535385114
          if [ "$AWS_ACCOUNT_ID" != "$META_AWS_ACCOUNT_ID" ] ; then
              aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \
                --password-stdin "$META_AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com"
          fi

    # Appends every environment variable whose name starts with GITHUB or CI
    # to a per-run file under /tmp.
    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    - name: Kill any existing containers, clean up images
      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
      shell: bash
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all of the docker images
        docker system prune -af

    - name: Manually resolve download.pytorch.org
      shell: bash
      continue-on-error: true
      run: |
        # Best-effort step: do not abort on individual command failures, and
        # trace every command for easier debugging.
        set +e
        set -x

        PT_DOMAIN=download.pytorch.org
        # TODO: Access to download.pytorch.org is flaky, see
        # https://github.com/pytorch/pytorch/issues/100400. Clean this up once
        # the issue is fixed. More than one IP may be resolved here; only the
        # last line of the dig output is used.
        RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)

        if [ -z "${RESOLVED_IP}" ]; then
          echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
          RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)

          if [ -z "${RESOLVED_IP}" ]; then
            echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
            exit 1
          fi
        fi

        if grep -r "${PT_DOMAIN}" /etc/hosts; then
          # Clean up any old records first
          sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
        fi

        echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
        cat /etc/hosts

    - name: Check that the docker daemon is running
      shell: bash
      continue-on-error: true
      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
      run: |
        set +x

        max_attempts=30
        delay=10

        # Poll `docker info` until it succeeds or the attempts are exhausted.
        for attempt in $(seq 1 "$max_attempts"); do
          echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
          if docker info > /dev/null 2>&1; then
            echo "Docker is running. Proceeding with the next steps"
            exit 0
          fi
          echo "Docker is not running yet."
          # No point in waiting after the final attempt.
          if [ "$attempt" -lt "$max_attempts" ]; then
            echo "Retrying in $delay seconds..."
            sleep "$delay"
          fi
        done
        echo "Reached maximum attempts to connect to Docker. Exiting."
        exit 1