/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>

#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"

namespace stream_executor {

class StreamExecutor;

namespace gpu {

// Pointer-to-implementation object type with virtual destruction for any
// XLA-specific data hanging off of the GpuExecutor.
class XLAInterface {
 public:
  // Default constructor for the abstract interface.
  explicit XLAInterface() {}

  // Default destructor for the abstract interface.
  virtual ~XLAInterface() {}
};

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
  // Helper classes to attach type-erased state to the GpuExecutor. Currently,
  // we just need to support some XLA-specific state.
  class Object {
    struct Concept {
      virtual ~Concept() {}
    };
    template <typename T>
    struct Model : Concept {
      explicit Model(StreamExecutor* se) : object(se) {}
      T object;
    };

   public:
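    // Returns the single object of type T attached to this executor, creating
    // it (constructed from the given StreamExecutor*) on first use. Accesses
    // are serialized on mu_, so creation happens at most once.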
    template <typename T>
    T* getOrCreate(StreamExecutor* se) {
      tensorflow::mutex_lock l(mu_);
      if (!object_) {
        object_ = std::make_unique<Model<T>>(se);
      }
      return &(dynamic_cast<Model<T>*>(object_.get())->object);
    }

   private:
    tensorflow::mutex mu_;
    std::unique_ptr<Concept> object_ ABSL_GUARDED_BY(mu_);
  };

 public:
  // Constructs a GpuExecutor for the given plugin configuration. The device
  // to operate on is selected and initialized later, via Init().
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  port::Status GetKernel(const MultiKernelLoaderSpec& spec,
                         KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  port::Status LoadModule(const MultiModuleLoaderSpec& spec,
                          ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;

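  // Estimates the number of thread blocks per multiprocessor that `func` can
  // sustain given its per-thread register usage, per-block shared memory
  // usage, and launch dimensions.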
  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

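  // Occupancy check variant that compares the achievable block count against
  // the caller-supplied baseline in `initial_blocks`.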
  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override;

  void* GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                     uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }

  bool HostMemoryRegister(void* location, uint64 size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  port::Status SynchronousMemZero(DeviceMemoryBase* location,
                                  uint64 size) override;

  port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
                                 uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;

  port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
                       uint64 size) override;
  port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
                      uint64 size) override;
  port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
                        uint32 pattern, uint64 size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool DeviceMemoryUsage(int64* free, int64* total) const override;

  // Searches for the named symbol and returns a device pointer and size.
  // Returns false if the symbol does not exist.
  bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
      const override {
    return CreateDeviceDescription(device_ordinal_);
  }

  static port::StatusOr<std::unique_ptr<DeviceDescription>>
  CreateDeviceDescription(int device_ordinal);

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

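  // Returns the driver-level GPU context this executor operates in; only
  // meaningful after Init() has succeeded.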
  GpuContext* gpu_context();

  // Provides a type-erased way of attaching arbitrary XLA-specific state to
  // the GpuExecutor. XLA-based execution uses this method to attach
  // per-stream-executor XLA-specific objects (like the Infeed and Outfeed
  // managers) to the stream executor, so that their lifetimes are tied to the
  // lifetime of the stream executor for which they were allocated. This
  // simplifies memory management compared to having these objects reside on
  // the side and then either leaking or having to implement callbacks that
  // the SE destructors call to deallocate any side state associated with that
  // SE object.
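  //
  // Illustrative usage (InfeedManager stands in for any client type that is
  // constructible from a StreamExecutor*):
  //
  //   InfeedManager* infeed_manager =
  //       gpu_executor->getOrCreateXLAState<InfeedManager>(stream_exec);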
  template <typename T>
  T* getOrCreateXLAState(StreamExecutor* se) {
    return xla_state_.getOrCreate<T>(se);
  }

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // e.g. when looking for "foo.ptx" on a compute capability 3.0 machine, we
  // also check whether "foo.ptx.cc30.ptx" is present.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      std::string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               std::string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: the user-provided callback passed to HostCallback() above, captured
  //     as a std::function<void()>. Allocated/initialized inside
  //     HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
                                 KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
  // (supported on CUDA only)
  port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

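  // Drops the reference that `gpu_binary` holds in gpu_binary_to_module_ and
  // unloads the corresponding module once its reference count reaches zero.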
  bool UnloadGpuBinary(const void* gpu_binary)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the on-disk-module mapping.
  absl::Mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we check this mapping first before retrieving the
  // module again.
  std::map<std::string, GpuModuleHandle> disk_modules_
      TF_GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  absl::Mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      TF_GUARDED_BY(in_memory_modules_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      TF_GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ TF_GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  absl::Mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      TF_GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  // Type-erased XLA-specific state attached to the GpuExecutor.
  Object xla_state_;

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

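// Casts a platform-level StreamExecutor to the GpuExecutor that implements it.
// The caller must ensure stream_exec is backed by the GPU (CUDA/ROCm)
// platform; no runtime check is performed here.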
inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_