/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/platform_util.h"

#include <algorithm>
#include <string>
#include <utility>

#include "absl/strings/ascii.h"
#include "absl/strings/str_join.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {

// Minimum supported CUDA compute capability is 3.5.
constexpr int kMinCudaComputeCapabilityMajor = 3;
constexpr int kMinCudaComputeCapabilityMinor = 5;

// Minimum supported AMDGPU ISA version is 803.
constexpr int kMinAMDGPUISAVersion = 803;

// The name of the interpreter platform.
constexpr char kInterpreter[] = "interpreter";

namespace {

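// Returns the canonical name for the given platform. For example
// (illustrative): "CPU" maps to "host", and "gpu" maps to "cuda" on CUDA
// builds or "rocm" on ROCm builds; any other name is simply lowercased.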
string CanonicalPlatformName(const string& platform_name) {
  string lowercase_platform_name = absl::AsciiStrToLower(platform_name);
  // "cpu" and "host" mean the same thing.
  if (lowercase_platform_name == "cpu") {
    return "host";
  }
  // When configured on CUDA, "gpu" and "cuda" mean the same thing.
  // When configured on ROCm, "gpu" and "rocm" mean the same thing.
  if (lowercase_platform_name == "gpu") {
#if TENSORFLOW_USE_ROCM
    return "rocm";
#else
    return "cuda";
#endif
  }
  return lowercase_platform_name;
}

StatusOr<std::vector<se::Platform*>> GetSupportedPlatforms() {
  return se::MultiPlatformManager::PlatformsWithFilter(
      [](const se::Platform* platform) {
        auto compiler_status = Compiler::GetForPlatform(platform);
        bool supported = compiler_status.ok();
        if (!supported) {
          LOG(INFO) << "platform " << platform->Name() << " present but no "
                    << "XLA compiler available: "
                    << compiler_status.status().error_message();
        }
        return supported;
      });
}

}  // namespace

/* static */ StatusOr<std::vector<se::Platform*>>
PlatformUtil::GetSupportedPlatforms() {
  // Gather all platforms which have an XLA compiler.
  return xla::GetSupportedPlatforms();
}

/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
  TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());

  se::Platform* platform = nullptr;
  if (platforms.empty()) {
    return NotFound("no platforms found");
  } else if (platforms.size() == 1) {
    platform = platforms[0];
  } else if (platforms.size() == 2) {
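    // If exactly two platforms are present and one of them is the
    // interpreter, prefer the other (non-interpreter) platform.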
    for (int i = 0; i < 2; i++) {
      if (absl::AsciiStrToLower(platforms[i]->Name()) == kInterpreter &&
          absl::AsciiStrToLower(platforms[1 - i]->Name()) != kInterpreter) {
        platform = platforms[1 - i];
        break;
      }
    }
  }
  if (platform != nullptr) {
    return platform;
  }

  // Multiple platforms present and we can't pick a reasonable default.
  string platforms_string = absl::StrJoin(
      platforms, ", ",
      [](string* out, const se::Platform* p) { out->append(p->Name()); });
  return InvalidArgument(
      "must specify platform because more than one platform (except for the "
      "interpreter platform) found: %s",
      platforms_string);
}

/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
    const string& platform_name) {
  TF_ASSIGN_OR_RETURN(se::Platform * platform,
                      se::MultiPlatformManager::PlatformWithName(
                          CanonicalPlatformName(platform_name)));
  TF_RETURN_IF_ERROR(Compiler::GetForPlatform(platform).status());
  return platform;
}

// Returns whether the device underlying the given StreamExecutor is supported
// by XLA.
static bool IsDeviceSupported(se::StreamExecutor* executor) {
  const auto& description = executor->GetDeviceDescription();
  if (executor->platform()->id() == se::cuda::kCudaPlatformId) {
    // CUDA devices must have a minimum compute capability.
    int major_version, minor_version;
    if (description.cuda_compute_capability(&major_version, &minor_version)) {
      if (major_version < kMinCudaComputeCapabilityMajor ||
          (major_version == kMinCudaComputeCapabilityMajor &&
           minor_version < kMinCudaComputeCapabilityMinor)) {
        LOG(INFO) << "StreamExecutor cuda device ("
                  << executor->device_ordinal() << ") is of "
                  << "insufficient compute capability: "
                  << kMinCudaComputeCapabilityMajor << "."
                  << kMinCudaComputeCapabilityMinor << " required, "
                  << "device is " << major_version << "." << minor_version;
        return false;
      }
    }
  } else if (executor->platform()->id() == se::rocm::kROCmPlatformId) {
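    // ROCm devices must have a minimum supported AMDGPU ISA version.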
    int isa_version = 0;
    if (description.rocm_amdgpu_isa_version(&isa_version)) {
      if (isa_version < kMinAMDGPUISAVersion) {
        LOG(INFO) << "StreamExecutor ROCM device ("
                  << executor->device_ordinal() << ") is of "
                  << "obsolete AMDGPU ISA version: "
                  << "gfx" << kMinAMDGPUISAVersion << " required, "
                  << "device is gfx" << isa_version;
        return false;
      }
    }
  }
  return true;
}

/* static */ StatusOr<std::vector<se::StreamExecutor*>>
PlatformUtil::GetStreamExecutors(
    se::Platform* platform,
    const absl::optional<std::set<int>>& allowed_devices) {
  int device_count = platform->VisibleDeviceCount();
  if (device_count <= 0) {
    return NotFound("no %s devices found", platform->Name());
  }
  if (platform->id() == se::host::kHostPlatformId) {
    // On host "devices", StreamExecutor exports a device for each hardware
    // thread. Because we parallelize a single computation across threads, it
    // doesn't make sense to expose these as separate devices, so by default we
    // fix the number of devices to one. However, we do let the user override
    // this behavior to help run tests on the host that run models in parallel
    // across multiple devices.
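    // (For example, this count can typically be overridden by setting
    // --xla_force_host_platform_device_count via the XLA_FLAGS environment
    // variable.)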
    device_count =
        GetDebugOptionsFromFlags().xla_force_host_platform_device_count();
  }
  std::vector<se::StreamExecutor*> stream_executors(device_count, nullptr);
  VLOG(1) << "Initializing devices";
  {
    tensorflow::thread::ThreadPool thread_pool(
        tensorflow::Env::Default(), "device_initialization", device_count);
    for (int i = 0; i < device_count; ++i) {
      // Once a stream executor is instantiated it will cause allocations on
      // the device; for example, for GPUs the CUDA context and cuDNN handles
      // will be constructed. By constructing stream executors only for the
      // allowed_devices, we don't make any allocations on other devices.
      // This helps multi-process executions on the same host (e.g. Horovod),
      // as well as shared hosts.
      if (allowed_devices && allowed_devices->count(i) == 0) {
        VLOG(1) << "Not initializing StreamExecutor for device " << i
                << " since it is not in the visible device list";
        continue;
      }
      thread_pool.Schedule([platform, i, &stream_executors]() {
        VLOG(1) << "Started device init " << i;
        se::StreamExecutorConfig config;
        config.ordinal = i;
        auto executor_status = platform->GetExecutor(config);
        if (executor_status.ok()) {
          se::StreamExecutor* executor = executor_status.ValueOrDie();
          if (IsDeviceSupported(executor)) {
            stream_executors[i] = executor;
          }
        } else {
          LOG(WARNING) << "unable to create StreamExecutor for "
                       << platform->Name() << ":" << i << ": "
                       << executor_status.status().error_message();
        }
        VLOG(1) << "Finished device init " << i;
      });
    }
    // Block here in thread_pool destructor until all devices are initialized.
  }
  VLOG(1) << "Device initialization complete";

  std::vector<se::StreamExecutor*> out;
  for (se::StreamExecutor* executor : stream_executors) {
    if (executor != nullptr) {
      out.push_back(executor);
    }
  }
  if (out.empty()) {
    return InternalError("no supported devices found for platform %s",
                         platform->Name());
  }
  return out;
}
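
// A minimal usage sketch (illustrative only; not part of this file): a caller
// with Status plumbing available would typically resolve a platform by name
// and then obtain its stream executors, e.g.
//
//   TF_ASSIGN_OR_RETURN(se::Platform * platform,
//                       PlatformUtil::GetPlatform("gpu"));
//   TF_ASSIGN_OR_RETURN(auto executors,
//                       PlatformUtil::GetStreamExecutors(
//                           platform, /*allowed_devices=*/absl::nullopt));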

}  // namespace xla