• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env bash
2# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15# ==============================================================================
16#
17#
18# A script to run multiple GPU tests in parallel controlled with an environment
19# variable.
20#
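# Optional environment variables (default used when unset or empty):
#     TF_GPU_COUNT = Number of GPUs available (default: 4).
#     TF_TESTS_PER_GPU = Number of test slots per GPU (default: 8).
#     TF_PER_DEVICE_MEMORY_LIMIT_MB = Exported to the test's environment
#                                     (default: 2048).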
23
# Fall back to 4 GPUs and 8 test slots per GPU when the caller leaves these
# unset or empty.
: "${TF_GPU_COUNT:=4}"
: "${TF_TESTS_PER_GPU:=8}"
26
27# This function is used below in rlocation to check that a path is absolute
is_absolute() {
  # Status 0 when $1 starts with "/" or with a drive prefix such as "C:/" or
  # "C:\"; status 1 for anything else.
  case "$1" in
    /*) return 0 ;;
    [a-zA-Z]:[/\\]*) return 0 ;;
  esac
  return 1
}
31
# Default to 2048 when unset or empty, then export so child processes (the
# test binary) see the value.
: "${TF_PER_DEVICE_MEMORY_LIMIT_MB:=2048}"
export TF_PER_DEVICE_MEMORY_LIMIT_MB
33
34# *******************************************************************
35#         This section of the script is needed to
36#         make things work on windows under msys.
37# *******************************************************************
RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST"

#######################################
# Resolves a runfile path and prints the result on stdout.
# Globals:
#   TEST_SRCDIR (read), RUNFILES_MANIFEST_FILE (read)
# Arguments:
#   $1 - an absolute path, or a path relative to $TEST_SRCDIR / a manifest key
# Outputs:
#   The resolved path, or nothing when the path cannot be resolved.
#######################################
function rlocation() {
  if is_absolute "$1"; then
    # If the file path is already fully specified, simply return it.
    printf '%s\n' "$1"
  elif [[ -e "${TEST_SRCDIR}/$1" ]]; then
    # If the file exists in the $TEST_SRCDIR then just use it.
    printf '%s\n' "${TEST_SRCDIR}/$1"
  elif [[ -e "${RUNFILES_MANIFEST_FILE}" ]]; then
    # If a runfiles manifest file exists then use it. Each manifest line is
    # "<key> <location>". The key is compared literally (not as a regex), so
    # characters such as "." or "+" in the path cannot match other entries.
    # The key travels through the environment rather than `awk -v` so that
    # backslashes in it are not interpreted as escape sequences.
    RLOCATION_KEY="$1" awk '
      BEGIN { prefix = ENVIRON["RLOCATION_KEY"] " "; n = length(prefix) }
      substr($0, 1, n) == prefix { print substr($0, n + 1); exit }
    ' "${RUNFILES_MANIFEST_FILE}"
  fi
}
51
# The first argument names the test binary relative to the workspace; a leading
# "./" is dropped before lookup. The argument to rlocation is quoted so that a
# path containing whitespace or glob characters is passed as a single word.
test_target="${1#./}"
TEST_BINARY="$(rlocation "${TEST_WORKSPACE}/${test_target}")"
shift
# Fail early with a clear message instead of later trying to execute an empty
# command name.
if [[ -z "${TEST_BINARY}" ]]; then
  echo "Cannot locate test binary ${TEST_WORKSPACE}/${test_target}" >&2
  exit 1
fi
54# *******************************************************************
55
mkdir -p /var/lock
# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU
# slots to run a test at. Each slot is a lock file
# /var/lock/gpulock<gpu>_<slot>, taken with a non-blocking flock.
#
# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU.
# So, we iterate over TF_TESTS_PER_GPU first.
for ((j = 0; j < TF_TESTS_PER_GPU; j++)); do
  for ((i = 0; i < TF_GPU_COUNT; i++)); do
    # Open the slot's lock file on a descriptor number chosen by the shell.
    exec {lock_fd}>"/var/lock/gpulock${i}_${j}" || exit 1
    if flock -n "$lock_fd"; then
      (
        # This export only works within the brackets, so it is isolated to one
        # single command.
        export CUDA_VISIBLE_DEVICES=$i
        export HIP_VISIBLE_DEVICES=$i
        echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES"
        # "$@" is quoted so that arguments containing whitespace or glob
        # characters reach the test binary unchanged.
        "$TEST_BINARY" "$@"
      )
      return_code=$?
      flock -u "$lock_fd"
      exit $return_code
    fi
    # The slot is busy: close its descriptor so it is not leaked into later
    # iterations or inherited by the test that eventually runs.
    exec {lock_fd}>&-
  done
done

echo "Cannot find a free GPU to run the test $* on, exiting with failure..."
exit 1
84