/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <utility>

#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"

namespace stream_executor {

class StreamExecutor;

namespace gpu {

// Pointer-to-implementation object type with virtual destruction for any XLA
// specific data hanging off of the GpuExecutor.
class XLAInterface {
 public:
  // Default constructor for the abstract interface.
  explicit XLAInterface() {}

  // Default destructor for the abstract interface.
  virtual ~XLAInterface() {}
};

// CUDA-platform implementation of the platform-agnostic
// StreamExecutorInterface.
class GpuExecutor : public internal::StreamExecutorInterface {
  // Helper classes to attach a type erased state to the GpuExecutor. Currently,
  // we just need to support some XLA specific state.
  class Object {
    struct Concept {
      virtual ~Concept() {}
    };
    template <typename T>
    struct Model : Concept {
      explicit Model(StreamExecutor* se) : object(se) {}
      T object;
    };

   public:
    template <typename T>
    T* getOrCreate(StreamExecutor* se) {
      tensorflow::mutex_lock l(mu_);
      if (!object_) {
        object_ = std::make_unique<Model<T>>(se);
      }
      return &(dynamic_cast<Model<T>*>(object_.get())->object);
    }

   private:
    tensorflow::mutex mu_;
    std::unique_ptr<Concept> object_ ABSL_GUARDED_BY(mu_);
  };
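
  // Illustrative note (sketch only; MyXlaState is a hypothetical type): the
  // first getOrCreate<T>() call constructs a Model<T> from the owning
  // StreamExecutor, and subsequent calls return the same object, so T must be
  // constructible from a StreamExecutor* and must be the same type on every
  // call for a given executor:
  //
  //   struct MyXlaState {
  //     explicit MyXlaState(StreamExecutor* se);
  //   };
  //   MyXlaState* state = xla_state_.getOrCreate<MyXlaState>(se);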

 public:
  // plugin_config specifies which plugin libraries (e.g. BLAS, DNN, FFT, RNG)
  // this executor should use; the device itself is selected later, in Init(),
  // via its device ordinal.
  explicit GpuExecutor(const PluginConfig& plugin_config)
      : device_(0),
        context_(nullptr),
        device_ordinal_(0),
        cc_major_(0),
        cc_minor_(0),
        version_(0),
        plugin_config_(plugin_config) {}

  // See the corresponding StreamExecutor methods for method comments on the
  // following overrides.

  ~GpuExecutor() override;

  port::Status Init(int device_ordinal, DeviceOptions device_options) override;

  port::Status GetKernel(const MultiKernelLoaderSpec& spec,
                         KernelBase* kernel) override;
  // (supported on CUDA only)
  void UnloadKernel(const KernelBase* kernel) override;
  port::Status LoadModule(const MultiModuleLoaderSpec& spec,
                          ModuleHandle* module_handle) override;
  bool UnloadModule(ModuleHandle module_handle) override;

  port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
                      const BlockDim& block_dims, const KernelBase& k,
                      const KernelArgsArrayBase& args) override;

  // (supported on CUDA only)
  int CalculateOccupancy(const DeviceDescription& device_description,
                         uint64 registers_per_thread,
                         uint64 shared_memory_per_block,
                         const ThreadDim& thread_dims, GpuFunctionHandle func);

  // (supported on CUDA only)
  int CompareOccupancy(int* initial_blocks,
                       const DeviceDescription& device_description,
                       uint64 registers_per_thread,
                       uint64 shared_memory_per_block,
                       const ThreadDim& thread_dims, GpuFunctionHandle func);

  DeviceMemoryBase Allocate(uint64 size, int64_t memory_space) override;

  void* GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
                     uint64 size_bytes) override;

  void Deallocate(DeviceMemoryBase* mem) override;

  void* UnifiedMemoryAllocate(uint64 size) override {
    return GpuDriver::UnifiedMemoryAllocate(context_, size);
  }

  void UnifiedMemoryDeallocate(void* location) override {
    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
  }

  // CUDA allocation/registration functions are necessary because the driver
  // internally sets up buffers for DMA operations (and page locks them).
  // There's no external interface for us to otherwise control these DMA
  // settings.
  void* HostMemoryAllocate(uint64 size) override {
    return GpuDriver::HostAllocate(context_, size);
  }

  void HostMemoryDeallocate(void* location) override {
    return GpuDriver::HostDeallocate(context_, location);
  }
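
  // A minimal usage sketch (illustrative only; `executor`, `stream`,
  // `device_mem`, and `size` are placeholders): page-locked buffers obtained
  // through the owning StreamExecutor's HostMemoryAllocate wrapper can serve
  // as the host side of asynchronous copies, e.g.
  //
  //   void* pinned = executor->HostMemoryAllocate(size);
  //   stream->ThenMemcpy(&device_mem, pinned, size);  // async host-to-device
  //   stream->BlockHostUntilDone();
  //   executor->HostMemoryDeallocate(pinned);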

  bool HostMemoryRegister(void* location, uint64 size) override;

  bool HostMemoryUnregister(void* location) override;

  bool SynchronizeAllActivity() override;

  port::Status SynchronousMemZero(DeviceMemoryBase* location,
                                  uint64 size) override;

  port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
                                 uint64 size) override;

  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
                                 const void* host_src, uint64 size) override;

  port::Status SynchronousMemcpy(void* host_dst,
                                 const DeviceMemoryBase& gpu_src,
                                 uint64 size) override;

  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
                                               const DeviceMemoryBase& gpu_src,
                                               uint64 size) override;

  port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
                       uint64 size) override;
  port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
                      uint64 size) override;
  port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
                        uint32 pattern, uint64 size) override;

  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
              uint64 size) override;

  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
              uint64 size) override;

  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
                            const DeviceMemoryBase& gpu_src,
                            uint64 size) override;

  bool HostCallback(Stream* stream,
                    std::function<port::Status()> callback) override;

  bool AllocateStream(Stream* stream) override;

  void DeallocateStream(Stream* stream) override;

  bool CreateStreamDependency(Stream* dependent, Stream* other) override;

  bool AllocateTimer(Timer* timer) override;

  void DeallocateTimer(Timer* timer) override;

  bool StartTimer(Stream* stream, Timer* timer) override;

  bool StopTimer(Stream* stream, Timer* timer) override;

  port::Status AllocateEvent(Event* event) override;

  port::Status DeallocateEvent(Event* event) override;

  port::Status RecordEvent(Stream* stream, Event* event) override;

  port::Status WaitForEvent(Stream* stream, Event* event) override;

  Event::Status PollForEventStatus(Event* event) override;

  port::Status BlockHostUntilDone(Stream* stream) override;

  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }

  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;

  bool DeviceMemoryUsage(int64* free, int64* total) const override;

  // Searches for the symbol in the given module and returns a device pointer
  // and size. Returns false if the symbol does not exist.
  bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
                 void** mem, size_t* bytes) override;

  port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
      const override {
    return CreateDeviceDescription(device_ordinal_);
  }

  static port::StatusOr<std::unique_ptr<DeviceDescription>>
  CreateDeviceDescription(int device_ordinal);

  bool SupportsBlas() const override;

  blas::BlasSupport* CreateBlas() override;

  bool SupportsFft() const override;

  fft::FftSupport* CreateFft() override;

  bool SupportsRng() const override;

  rng::RngSupport* CreateRng() override;

  bool SupportsDnn() const override;

  dnn::DnnSupport* CreateDnn() override;

  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
      override;

  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
      override;

  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;

  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;

  void* GpuContextHack() override;

  GpuContext* gpu_context();

  // Provides a type-erased way of attaching arbitrary XLA-specific state to
  // the GpuExecutor. XLA-based execution uses this method to attach
  // per-StreamExecutor objects (such as the Infeed and Outfeed managers) to
  // the stream executor, so that their lifetimes are tied to the lifetime of
  // the stream executor they are allocated for. This simplifies memory
  // management compared to keeping these objects on the side and either
  // leaking them or implementing callbacks that the StreamExecutor destructor
  // invokes to deallocate any associated side state.
  template <typename T>
  T* getOrCreateXLAState(StreamExecutor* se) {
    return xla_state_.getOrCreate<T>(se);
  }
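
  // For example (sketch only; InfeedManager stands in for any XLA-side type
  // that is constructible from a StreamExecutor*), XLA code holding a
  // GpuExecutor* could attach and retrieve its state like this:
  //
  //   InfeedManager* infeed =
  //       gpu_executor->getOrCreateXLAState<InfeedManager>(stream_exec);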

 private:
  // Attempts to find a more specific version of the file indicated by
  // filename by looking for compute-capability-specific suffixed versions;
  // e.g. looking for "foo.ptx" will check whether "foo.ptx.cc30.ptx" is
  // present if we're on a compute capability 3.0 machine.
  // (supported on CUDA only)
  bool FindOnDiskForComputeCapability(absl::string_view filename,
                                      absl::string_view canonical_suffix,
                                      std::string* found_filename) const;

  // Attempts to find a more specific version of the file indicated by
  // filename by looking for AMDGPU ISA-specific suffixed versions.
  // (supported on ROCm only)
  bool FindOnDiskForISAVersion(absl::string_view filename,
                               absl::string_view canonical_suffix,
                               std::string* found_filename) const;

  // Host callback landing routine invoked by CUDA.
  // data: User-provided callback provided to HostCallback() above, captured
  //       as a std::function<void()>. Allocated/initialized inside
  //       HostCallback() and owned and deleted by this call.
  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
                                   void* data);

  // Collects metadata for the specified kernel.
  port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
                                 KernelMetadata* kernel_metadata);

  // Prints to VLOG(2) information about the kernel's occupancy and how it
  // might be improved.
  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
                         const BlockDim& block_dims);

  // (supported on CUDA only)
  port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
  // (supported on CUDA only)
  port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // (supported on ROCm only)
  port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  bool UnloadGpuBinary(const void* gpu_binary)
      TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);

  // Guards the on-disk-module mapping.
  absl::Mutex disk_modules_mu_;

  // Mapping from filename to GpuModuleHandle, if it was already retrieved.
  // Multiple GpuFunctionHandles are usually obtained from a single
  // GpuModuleHandle, so we attempt to hit this mapping first before
  // retrieving the module again.
  std::map<std::string, GpuModuleHandle> disk_modules_
      TF_GUARDED_BY(disk_modules_mu_);

  // Guards the in-memory-module mapping.
  absl::Mutex in_memory_modules_mu_;

  std::map<const char*, GpuModuleHandle> in_memory_modules_
      TF_GUARDED_BY(in_memory_modules_mu_);

  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
      TF_GUARDED_BY(in_memory_modules_mu_);
  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
      gpu_binary_to_module_ TF_GUARDED_BY(in_memory_modules_mu_);

  // Guards the launched kernel set.
  absl::Mutex launched_kernels_mu_;

  // Keeps track of the set of launched kernels. Currently used to suppress the
  // occupancy check on subsequent launches.
  std::set<GpuFunctionHandle> launched_kernels_
      TF_GUARDED_BY(launched_kernels_mu_);

  // Handle for the CUDA device being operated on. Immutable
  // post-initialization.
  GpuDeviceHandle device_;

  // Handle for session with the library/driver. Immutable post-initialization.
  GpuContext* context_;

  // The device ordinal value that this executor was initialized with; recorded
  // for use in getting device metadata. Immutable post-initialization.
  int device_ordinal_;

  // The major version of the compute capability for device_.
  int cc_major_;

  // The minor version of the compute capability for device_.
  int cc_minor_;

  // GPU ISA version for device_.
  int version_;

  // The plugin configuration associated with this instance.
  PluginConfig plugin_config_;

  // Type erased XLA specific state attached to GpuExecutor.
  Object xla_state_;

  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
};

inline GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
  return static_cast<GpuExecutor*>(stream_exec->implementation());
}
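
// Example (illustrative sketch; `stream_exec` must already be backed by the
// CUDA/ROCm GPU platform, otherwise the static_cast above is invalid): callers
// use this helper to reach GPU-specific functionality on an executor, e.g.
//
//   GpuExecutor* gpu_exec = ExtractGpuExecutor(stream_exec);
//   GpuContext* ctx = gpu_exec->gpu_context();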

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_