#!/usr/bin/env bash
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
#
# A script to run multiple GPU tests in parallel controlled with environment
# variables.
#
# Usage:
#     <this script> <test binary> [test arguments...]
#
# Environment variables read by this script:
#     TF_GPU_COUNT = Number of GPUs available (defaults to 4).
#     TF_TESTS_PER_GPU = Number of test slots per GPU (defaults to 8).
#     TF_PER_DEVICE_MEMORY_LIMIT_MB = Exported to the test; defaults to 2048
#         when not already set.
#     TEST_SRCDIR, TEST_WORKSPACE = Used to resolve the test binary path.

TF_GPU_COUNT=${TF_GPU_COUNT:-4}
TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8}

# This function is used below in rlocation to check that a path is absolute.
# It succeeds for paths starting with "/" and for drive-letter paths such as
# "C:/..." or "C:\...".
function is_absolute {
  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
}

export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-2048}

# *******************************************************************
#         This section of the script is needed to
#         make things work on windows under msys.
# *******************************************************************
RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"

# Resolves a runfiles path to a real path and prints it on stdout.
# Arguments: $1 - absolute path, or path relative to $TEST_SRCDIR.
# Outputs:   the resolved path; nothing when the path cannot be resolved.
function rlocation() {
  if is_absolute "$1"; then
    # If the file path is already fully specified, simply return it.
    echo "$1"
  elif [[ -e "$TEST_SRCDIR/$1" ]]; then
    # If the file exists in the $TEST_SRCDIR then just use it.
    echo "$TEST_SRCDIR/$1"
  elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then
    # If a runfiles manifest file exists then use it: find the line whose
    # first space-separated field is $1 and print the rest of that line.
    # NOTE: $1 is interpreted by grep as a regular expression.
    grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //'
  fi
}

# The first argument names the test binary; a leading "./" is stripped before
# the path is looked up under the workspace directory.
TEST_BINARY="$(rlocation "$TEST_WORKSPACE/${1#./}")"
shift
# *******************************************************************

if [[ -z "$TEST_BINARY" ]]; then
  # Fail early instead of taking a GPU slot for a binary we cannot run.
  echo "Cannot resolve the path of the test binary, exiting with failure..." >&2
  exit 1
fi

mkdir -p /var/lock
# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
# slots to run a test at.
#
# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
# So, we iterate over TF_TESTS_PER_GPU first.
for ((j = 0; j < TF_TESTS_PER_GPU; j++)); do
  for ((i = 0; i < TF_GPU_COUNT; i++)); do
    # Open (creating if needed) the lock file for slot j of GPU i on a
    # descriptor chosen by bash and stored in lock_fd.
    exec {lock_fd}>"/var/lock/gpulock${i}_${j}" || exit 1
    if flock -n "$lock_fd"; then
      (
        # This export only works within the brackets, so it is isolated to one
        # single command.
        export CUDA_VISIBLE_DEVICES=$i
        export HIP_VISIBLE_DEVICES=$i
        echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
        # Quote "$@" so that every test argument is forwarded unchanged, even
        # when it contains whitespace or glob characters.
        "$TEST_BINARY" "$@"
      )
      return_code=$?
      flock -u "$lock_fd"
      exit "$return_code"
    fi
    # The slot is busy: close the descriptor so that unused descriptors do not
    # pile up while the remaining slots are probed.
    exec {lock_fd}>&-
  done
done

echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
exit 1