/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>

#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"

39 namespace stream_executor {
40 namespace gpu {
41 
42 // CUDA-platform implementation of the platform-agnostic
43 // StreamExecutorInterface.
44 class GpuExecutor : public internal::StreamExecutorInterface {
45  public:
46   // sub_platform indicates the subplatform used in this executor; it must
47   // be a CUDA type.
GpuExecutor(const PluginConfig & plugin_config)48   explicit GpuExecutor(const PluginConfig& plugin_config)
49       : device_(0),
50         context_(nullptr),
51         device_ordinal_(0),
52         cc_major_(0),
53         cc_minor_(0),
54         version_(0),
55         plugin_config_(plugin_config) {}
56 
57   // See the corresponding StreamExecutor methods for method comments on the
58   // following overrides.
59 
60   ~GpuExecutor() override;
61 
62   port::Status Init(int device_ordinal, DeviceOptions device_options) override;
63 
64   port::Status GetKernel(const MultiKernelLoaderSpec& spec,
65                          KernelBase* kernel) override;
66   // (supported on CUDA only)
67   void UnloadKernel(const KernelBase* kernel) override;
68   port::Status LoadModule(const MultiModuleLoaderSpec& spec,
69                           ModuleHandle* module_handle) override;
70   bool UnloadModule(ModuleHandle module_handle) override;
71 
72   port::Status Launch(Stream* stream, const ThreadDim& thread_dims,
73                       const BlockDim& block_dims, const KernelBase& k,
74                       const KernelArgsArrayBase& args) override;
75 
76   // (supported on CUDA only)
77   int CalculateOccupancy(const DeviceDescription& device_description,
78                          uint64 registers_per_thread,
79                          uint64 shared_memory_per_block,
80                          const ThreadDim& thread_dims, GpuFunctionHandle func);
81 
82   // (supported on CUDA only)
83   int CompareOccupancy(int* initial_blocks,
84                        const DeviceDescription& device_description,
85                        uint64 registers_per_thread,
86                        uint64 shared_memory_per_block,
87                        const ThreadDim& thread_dims, GpuFunctionHandle func);
88 
89   DeviceMemoryBase Allocate(uint64 size, int64 memory_space) override;
90 
91   void* GetSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
92                      uint64 size_bytes) override;
93 
94   void Deallocate(DeviceMemoryBase* mem) override;
95 
UnifiedMemoryAllocate(uint64 size)96   void* UnifiedMemoryAllocate(uint64 size) override {
97     return GpuDriver::UnifiedMemoryAllocate(context_, size);
98   }
99 
UnifiedMemoryDeallocate(void * location)100   void UnifiedMemoryDeallocate(void* location) override {
101     return GpuDriver::UnifiedMemoryDeallocate(context_, location);
102   }
103 
104   // CUDA allocation/registration functions are necessary because the driver
105   // internally sets up buffers for DMA operations (and page locks them).
106   // There's no external interface for us to otherwise control these DMA
107   // settings.
HostMemoryAllocate(uint64 size)108   void* HostMemoryAllocate(uint64 size) override {
109     return GpuDriver::HostAllocate(context_, size);
110   }
111 
HostMemoryDeallocate(void * location)112   void HostMemoryDeallocate(void* location) override {
113     return GpuDriver::HostDeallocate(context_, location);
114   }
115 
116   bool HostMemoryRegister(void* location, uint64 size) override;
117 
118   bool HostMemoryUnregister(void* location) override;
119 
120   bool SynchronizeAllActivity() override;
121 
122   port::Status SynchronousMemZero(DeviceMemoryBase* location,
123                                   uint64 size) override;
124 
125   port::Status SynchronousMemSet(DeviceMemoryBase* location, int value,
126                                  uint64 size) override;
127 
128   port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
129                                  const void* host_src, uint64 size) override;
130 
131   port::Status SynchronousMemcpy(void* host_dst,
132                                  const DeviceMemoryBase& gpu_src,
133                                  uint64 size) override;
134 
135   port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
136                                                const DeviceMemoryBase& gpu_src,
137                                                uint64 size) override;
138 
139   port::Status MemZero(Stream* stream, DeviceMemoryBase* location,
140                        uint64 size) override;
141   port::Status Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
142                       uint64 size) override;
143   port::Status Memset32(Stream* stream, DeviceMemoryBase* location,
144                         uint32 pattern, uint64 size) override;
145 
146   bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
147               uint64 size) override;
148 
149   bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
150               uint64 size) override;
151 
152   bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
153                             const DeviceMemoryBase& gpu_src,
154                             uint64 size) override;
155 
156   bool HostCallback(Stream* stream,
157                     std::function<port::Status()> callback) override;
158 
159   bool AllocateStream(Stream* stream) override;
160 
161   void DeallocateStream(Stream* stream) override;
162 
163   bool CreateStreamDependency(Stream* dependent, Stream* other) override;
164 
165   bool AllocateTimer(Timer* timer) override;
166 
167   void DeallocateTimer(Timer* timer) override;
168 
169   bool StartTimer(Stream* stream, Timer* timer) override;
170 
171   bool StopTimer(Stream* stream, Timer* timer) override;
172 
173   port::Status AllocateEvent(Event* event) override;
174 
175   port::Status DeallocateEvent(Event* event) override;
176 
177   port::Status RecordEvent(Stream* stream, Event* event) override;
178 
179   port::Status WaitForEvent(Stream* stream, Event* event) override;
180 
181   Event::Status PollForEventStatus(Event* event) override;
182 
183   port::Status BlockHostUntilDone(Stream* stream) override;
184 
PlatformDeviceCount()185   int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
186 
187   port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
188 
189   bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
190 
191   bool DeviceMemoryUsage(int64* free, int64* total) const override;
192 
193   // Search for the symbol and returns a device pointer and size.
194   // Returns false if symbol does not exist.
195   bool GetSymbol(const std::string& symbol_name, ModuleHandle module_handle,
196                  void** mem, size_t* bytes) override;
197 
CreateDeviceDescription()198   port::StatusOr<std::unique_ptr<DeviceDescription>> CreateDeviceDescription()
199       const override {
200     return CreateDeviceDescription(device_ordinal_);
201   }
202 
203   static port::StatusOr<std::unique_ptr<DeviceDescription>>
204   CreateDeviceDescription(int device_ordinal);
205 
206   bool SupportsBlas() const override;
207 
208   blas::BlasSupport* CreateBlas() override;
209 
210   bool SupportsFft() const override;
211 
212   fft::FftSupport* CreateFft() override;
213 
214   bool SupportsRng() const override;
215 
216   rng::RngSupport* CreateRng() override;
217 
218   bool SupportsDnn() const override;
219 
220   dnn::DnnSupport* CreateDnn() override;
221 
222   std::unique_ptr<internal::EventInterface> CreateEventImplementation()
223       override;
224 
225   std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
226       override;
227 
228   std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
229 
230   std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
231 
232   void* GpuContextHack() override;
233 
234   GpuContext* gpu_context();
235 
236  private:
237   // Attempts to find a more specific version of the file indicated by
238   // filename by looking for compute-capability-specific suffixed versions; i.e.
239   // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
240   // we're on a compute capability 3.0 machine.
241   // (supported on CUDA only)
242   bool FindOnDiskForComputeCapability(absl::string_view filename,
243                                       absl::string_view canonical_suffix,
244                                       std::string* found_filename) const;
245 
246   // Attempts to find a more specific version of the file indicated by
247   // filename by looking for AMDGPU ISA-specific suffixed versions.
248   // (supported on ROCm only)
249 
250   bool FindOnDiskForISAVersion(absl::string_view filename,
251                                absl::string_view canonical_suffix,
252                                std::string* found_filename) const;
253 
254   // Host callback landing routine invoked by CUDA.
255   // data: User-provided callback provided to HostCallback() above, captured
256   //       as a std::function<void()>. Allocated/initialized inside
257   //       HostCallback() and owned and deleted by this call.
258   static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
259                                    void* data);
260 
261   // Collects metadata for the specified kernel.
262   port::Status GetKernelMetadata(GpuKernel* cuda_kernel,
263                                  KernelMetadata* kernel_metadata);
264 
265   // Prints to VLOG(2) information about the kernel's occupancy and how it might
266   // be improved.
267   void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
268                          const BlockDim& block_dims);
269 
270   // (supported on CUDA only)
271   port::Status LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
272       TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
273 
274   // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
275   // (supported on CUDA only)
276   port::Status LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
277       TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
278 
279   // (supported on ROCm only)
280   port::Status LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
281       TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
282 
283   bool UnloadGpuBinary(const void* gpu_binary)
284       TF_EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
285 
286   // Guards the on-disk-module mapping.
287   absl::Mutex disk_modules_mu_;
288 
289   // Mapping from filename to GPUModuleHandle, if it was already retrieved.
290   // Multiple GPUFunctionHandle are usually obtained from a single
291   // GPUModuleHandle so we attempt to hit in this mapping first, before
292   // retrieving it.
293   std::map<std::string, GpuModuleHandle> disk_modules_
294       TF_GUARDED_BY(disk_modules_mu_);
295 
296   // Guards the in-memory-module mapping.
297   absl::Mutex in_memory_modules_mu_;
298 
299   std::map<const char*, GpuModuleHandle> in_memory_modules_
300       TF_GUARDED_BY(in_memory_modules_mu_);
301 
302   // Kernel -> loaded GPU binary. Many kernels may load the same binary.
303   std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
304       TF_GUARDED_BY(in_memory_modules_mu_);
305   // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
306   std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
307       gpu_binary_to_module_ TF_GUARDED_BY(in_memory_modules_mu_);
308 
309   // Guards the launched kernel set.
310   absl::Mutex launched_kernels_mu_;
311 
312   // Keeps track of the set of launched kernels. Currently used to suppress the
313   // occupancy check on subsequent launches.
314   std::set<GpuFunctionHandle> launched_kernels_
315       TF_GUARDED_BY(launched_kernels_mu_);
316 
317   // Handle for the CUDA device being operated on. Immutable
318   // post-initialization.
319   GpuDeviceHandle device_;
320 
321   // Handle for session with the library/driver. Immutable post-initialization.
322   GpuContext* context_;
323 
324   // The device ordinal value that this executor was initialized with; recorded
325   // for use in getting device metadata. Immutable post-initialization.
326   int device_ordinal_;
327 
328   // The major version of the compute capability for device_.
329   int cc_major_;
330 
331   // The minor version of the compute capability for device_.
332   int cc_minor_;
333 
334   // GPU ISA version for device_.
335   int version_;
336 
337   // The plugin configuration associated with this instance.
338   PluginConfig plugin_config_;
339 
340   SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
341 };
342 
343 }  // namespace gpu
344 }  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_