• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
17 // device and platform properties. Also contains convenience functions for
18 // checking/calculating launch dimensionality based on device properties.
19 
20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
22 
#include <map>
#include <memory>
#include <string>
#include <utility>

#include "absl/base/macros.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/port.h"
30 
31 namespace stream_executor {
32 namespace internal {
33 class DeviceDescriptionBuilder;
34 }  // namespace internal
35 
// CUDA compute capability, as reported by the device description.
//
// Values are compared as (major, minor) pairs, i.e. the major version is
// compared first and the minor version breaks ties.
struct CudaComputeCapability {
  int major = 0;
  int minor = 0;

  // Named major versions. MSVC does not like "PASCAL" symbol, hence the
  // trailing underscore on that enumerator.
  enum CudaComputeCapabilities { PASCAL_ = 6, VOLTA = 7, AMPERE = 8 };

  // Creates the capability 0.0 (from the member initializers above).
  CudaComputeCapability() {}

  // Creates the capability `major`.`minor`.
  //
  // The members are assigned in the body rather than through a
  // `major(major)`-style initializer list.
  // NOTE(review): presumably some platform headers define function-like
  // `major(...)`/`minor(...)` macros, which that spelling would trigger;
  // confirm before changing it.
  CudaComputeCapability(int major, int minor) {
    this->major = major;
    this->minor = minor;
  }

  // Returns true if this capability is `other_major`.`other_minor` or newer.
  bool IsAtLeast(int other_major, int other_minor = 0) const {
    return !(*this < CudaComputeCapability{other_major, other_minor});
  }

  bool operator<(const CudaComputeCapability &other) const {
    return ToPair() < other.ToPair();
  }

  bool operator==(const CudaComputeCapability &other) const {
    return ToPair() == other.ToPair();
  }

  bool operator!=(const CudaComputeCapability &other) const {
    return !(*this == other);
  }

  // Returns the capability formatted as "<major>.<minor>", e.g. "7.5".
  //
  // Built with std::to_string so that this header only relies on the standard
  // library for formatting.
  std::string ToString() const {
    return std::to_string(major) + "." + std::to_string(minor);
  }

  // Returns (major, minor) as a pair; used by the comparison operators.
  std::pair<int, int> ToPair() const { return std::make_pair(major, minor); }
};
70 
71 // Data that describes the execution target of the StreamExecutor, in terms of
72 // important logical parameters. These include dimensionality limits and
73 // physical parameters of interest, such as number of cores present on the
74 // device.
75 //
76 // Thread-safe: immutable post-initialization.
77 class DeviceDescription {
78  public:
79   // Returns the platform being run on; this value is primarily intended for
80   // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
81   // 3.5".
platform_version()82   const std::string &platform_version() const { return platform_version_; }
83 
84   // Returns the driver version interfacing with the underlying platform. Vendor
85   // dependent format.
driver_version()86   const std::string &driver_version() const { return driver_version_; }
87 
88   // Return the runtime version, if one is provided by the underlying platform.
89   // Vendor dependent format / usefulness.
runtime_version()90   const std::string &runtime_version() const { return runtime_version_; }
91 
92   // Returns the name that the device reports. Vendor dependent.
name()93   const std::string &name() const { return name_; }
94 
95   // Returns the PCI bus identifier for this device, of the form
96   // [domain]:[bus]:[device].[function]
pci_bus_id()97   const std::string &pci_bus_id() const { return pci_bus_id_; }
98 
99   // Returns the NUMA node associated with this device, for use in
100   // determining socket locality. If the NUMA node could not be determined, -1
101   // is returned.
numa_node()102   int numa_node() const { return numa_node_; }
103 
104   // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device
105   // or an AMD Compute Unit.
core_count()106   int core_count() const { return core_count_; }
107 
108   // Returns the limit on the thread dimensionality values in each of the
109   // respective dimensions. These limits affect what constitutes a legitimate
110   // kernel launch request.
thread_dim_limit()111   const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
112 
113   // Returns the limit on the block dimensionality values in each of the
114   // respective dimensions. These limits may affect what constitutes a
115   // legitimate kernel launch request.
block_dim_limit()116   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
117 
118   // Returns the limit on the total number of threads that can be launched in a
119   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
120   // This limit affects what constitutes a legitimate kernel launch request.
threads_per_block_limit()121   const int64 &threads_per_block_limit() const {
122     return threads_per_block_limit_;
123   }
124 
125   // Returns the limit on the total number of threads that can be simultaneously
126   // launched on a given multiprocessor.
threads_per_core_limit()127   const int64 &threads_per_core_limit() const {
128     return threads_per_core_limit_;
129   }
130 
131   // Returns the number of threads per warp/wavefront.
threads_per_warp()132   const int64 &threads_per_warp() const { return threads_per_warp_; }
133 
134   // Returns the limit on the total number of registers per core.
registers_per_core_limit()135   const int64 &registers_per_core_limit() const {
136     return registers_per_core_limit_;
137   }
138 
139   // Returns the limit on the total number of registers that can be
140   // simultaneously used by a block.
registers_per_block_limit()141   const int64 &registers_per_block_limit() const {
142     return registers_per_block_limit_;
143   }
144 
145   // Returns the number of address bits available to kernel code running on the
146   // platform. This affects things like the maximum allocation size and perhaps
147   // types used in kernel code such as size_t.
device_address_bits()148   const int64 &device_address_bits() const { return device_address_bits_; }
149 
150   // Returns the device memory size in bytes.
device_memory_size()151   int64 device_memory_size() const { return device_memory_size_; }
152 
153   // Returns the device's memory bandwidth in bytes/sec.  (This is for
154   // reads/writes to/from the device's own memory, not for transfers between the
155   // host and device.)
memory_bandwidth()156   int64 memory_bandwidth() const { return memory_bandwidth_; }
157 
158   // Returns the device's core clock rate in GHz.
clock_rate_ghz()159   float clock_rate_ghz() const { return clock_rate_ghz_; }
160 
161   // Returns whether ECC is enabled.
ecc_enabled()162   bool ecc_enabled() const { return ecc_enabled_; }
163 
164   // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
165   // Micro Devices, Inc.", or "GenuineIntel".
device_vendor()166   const std::string &device_vendor() const { return device_vendor_; }
167 
168   // Returns the CUDA compute capability if we're running on the CUDA platform.
169   // If a CUDA compute capability is not available, the major version will be
170   // zero.
171   CudaComputeCapability cuda_compute_capability() const;
172 
173   // Returns the AMDGPU ISA version if we're running on the ROCm platform.
174   // If the information is not available, the version is not modified,
175   // and the return value will be false.
176   bool rocm_amdgpu_isa_version(int *version) const;
177 
178   // Returns the
179   // * AMDGPU GCN Architecture Name if we're running on the ROCm platform.
180   // * kUndefinedString otherwise
rocm_amdgpu_gcn_arch_name()181   const std::string rocm_amdgpu_gcn_arch_name() const {
182     return rocm_amdgpu_gcn_arch_name_;
183   }
184 
185   // Returns the maximum amount of shared memory present on a single core
186   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
187   // devices). Note that some devices, such as NVIDIA's have a configurable
188   // partitioning between shared memory and L1 cache.
shared_memory_per_core()189   int64 shared_memory_per_core() const { return shared_memory_per_core_; }
190 
191   // Returns the maximum amount of shared memory available for a single block.
shared_memory_per_block()192   int64 shared_memory_per_block() const { return shared_memory_per_block_; }
193 
194   // TODO(leary): resident blocks per core will be useful.
195 
196   // Convenience typedef for the string-based DeviceDescription mapping.
197   typedef std::map<std::string, std::string> Map;
198 
199   // Returns a mapping from readable names to readable values that describe the
200   // device. This is useful for things like printing.
201   std::unique_ptr<Map> ToMap() const;
202 
203   // For string values that are not available via the underlying platform, this
204   // value will be provided.
205   static const char *kUndefinedString;
206 
207  private:
208   friend class internal::DeviceDescriptionBuilder;
209 
210   DeviceDescription();
211 
212   // For description of the following members, see the corresponding accessor
213   // above.
214   //
215   // N.B. If another field is added, update ToMap() above.
216   std::string device_vendor_;
217   std::string platform_version_;
218   std::string driver_version_;
219   std::string runtime_version_;
220   std::string pci_bus_id_;
221   std::string name_;
222 
223   ThreadDim thread_dim_limit_;
224   BlockDim block_dim_limit_;
225 
226   int64 threads_per_core_limit_;
227   int64 threads_per_block_limit_;
228   int64 threads_per_warp_;
229 
230   int64 registers_per_core_limit_;
231   int64 registers_per_block_limit_;
232 
233   int64 device_address_bits_;
234   int64 device_memory_size_;
235   int64 memory_bandwidth_;
236 
237   // Shared memory limits on a given device.
238   int64 shared_memory_per_core_;
239   int64 shared_memory_per_block_;
240 
241   float clock_rate_ghz_;
242 
243   // CUDA "CC" major value, -1 if not available.
244   CudaComputeCapability cuda_compute_capability_{-1, -1};
245 
246   // ROCM AMDGPU ISA version, 0 if not available.
247   int rocm_amdgpu_isa_version_;
248 
249   // ROCm AMDGPU GCN Architecture name, "" if not available.
250   std::string rocm_amdgpu_gcn_arch_name_;
251 
252   int numa_node_;
253   int core_count_;
254   bool ecc_enabled_;
255 
256   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
257 };
258 
259 namespace internal {
260 
261 // Helper class the builds a device description, given that it has a large
262 // number of fields that would be easily confused in constructor form.
263 class DeviceDescriptionBuilder {
264  public:
265   DeviceDescriptionBuilder();
266 
267   // For descriptions of the following fields, see comments on the corresponding
268   // DeviceDescription::* accessors above.
269 
set_device_vendor(const std::string & value)270   void set_device_vendor(const std::string &value) {
271     device_description_->device_vendor_ = value;
272   }
set_platform_version(const std::string & value)273   void set_platform_version(const std::string &value) {
274     device_description_->platform_version_ = value;
275   }
set_driver_version(const std::string & value)276   void set_driver_version(const std::string &value) {
277     device_description_->driver_version_ = value;
278   }
set_runtime_version(const std::string & value)279   void set_runtime_version(const std::string &value) {
280     device_description_->runtime_version_ = value;
281   }
set_pci_bus_id(const std::string & value)282   void set_pci_bus_id(const std::string &value) {
283     device_description_->pci_bus_id_ = value;
284   }
set_name(const std::string & value)285   void set_name(const std::string &value) {
286     device_description_->name_ = value;
287   }
288 
set_thread_dim_limit(const ThreadDim & value)289   void set_thread_dim_limit(const ThreadDim &value) {
290     device_description_->thread_dim_limit_ = value;
291   }
set_block_dim_limit(const BlockDim & value)292   void set_block_dim_limit(const BlockDim &value) {
293     device_description_->block_dim_limit_ = value;
294   }
295 
set_threads_per_core_limit(int64_t value)296   void set_threads_per_core_limit(int64_t value) {
297     device_description_->threads_per_core_limit_ = value;
298   }
set_threads_per_block_limit(int64_t value)299   void set_threads_per_block_limit(int64_t value) {
300     device_description_->threads_per_block_limit_ = value;
301   }
set_threads_per_warp(int64_t value)302   void set_threads_per_warp(int64_t value) {
303     device_description_->threads_per_warp_ = value;
304   }
305 
set_registers_per_core_limit(int64_t value)306   void set_registers_per_core_limit(int64_t value) {
307     device_description_->registers_per_core_limit_ = value;
308   }
set_registers_per_block_limit(int64_t value)309   void set_registers_per_block_limit(int64_t value) {
310     device_description_->registers_per_block_limit_ = value;
311   }
312 
set_device_address_bits(int64_t value)313   void set_device_address_bits(int64_t value) {
314     device_description_->device_address_bits_ = value;
315   }
set_device_memory_size(int64_t value)316   void set_device_memory_size(int64_t value) {
317     device_description_->device_memory_size_ = value;
318   }
set_memory_bandwidth(int64_t value)319   void set_memory_bandwidth(int64_t value) {
320     device_description_->memory_bandwidth_ = value;
321   }
322 
set_shared_memory_per_core(int64_t value)323   void set_shared_memory_per_core(int64_t value) {
324     device_description_->shared_memory_per_core_ = value;
325   }
set_shared_memory_per_block(int64_t value)326   void set_shared_memory_per_block(int64_t value) {
327     device_description_->shared_memory_per_block_ = value;
328   }
329 
set_clock_rate_ghz(float value)330   void set_clock_rate_ghz(float value) {
331     device_description_->clock_rate_ghz_ = value;
332   }
333 
set_cuda_compute_capability(int major,int minor)334   void set_cuda_compute_capability(int major, int minor) {
335     device_description_->cuda_compute_capability_ =
336         CudaComputeCapability{major, minor};
337   }
338 
set_rocm_amdgpu_isa_version(int version)339   void set_rocm_amdgpu_isa_version(int version) {
340     device_description_->rocm_amdgpu_isa_version_ = version;
341   }
342 
set_rocm_amdgpu_gcn_arch_name(const std::string & gcn_arch_name)343   void set_rocm_amdgpu_gcn_arch_name(const std::string &gcn_arch_name) {
344     device_description_->rocm_amdgpu_gcn_arch_name_ = gcn_arch_name;
345   }
346 
set_numa_node(int value)347   void set_numa_node(int value) { device_description_->numa_node_ = value; }
set_core_count(int value)348   void set_core_count(int value) { device_description_->core_count_ = value; }
set_ecc_enabled(bool value)349   void set_ecc_enabled(bool value) {
350     device_description_->ecc_enabled_ = value;
351   }
352 
353   // Returns a built DeviceDescription with ownership transferred to the
354   // caller. There are currently no restrictions on which fields must be set in
355   // order to build the descriptor.
356   //
357   // Once the description is built, this builder object should be discarded.
Build()358   std::unique_ptr<DeviceDescription> Build() {
359     return std::move(device_description_);
360   }
361 
362  private:
363   std::unique_ptr<DeviceDescription> device_description_;
364 
365   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
366 };
367 
368 }  // namespace internal
369 
370 // Returns whether the given thread_dim is acceptable given the limits described
371 // in device_description. For detailed reasons for failing the predicate, enable
372 // VLOG(2) for this module.
373 bool ThreadDimOk(const DeviceDescription &device_description,
374                  const ThreadDim &thread_dim);
375 
376 // Equivalent to ceil(double(element_count) / threads_per_block).
377 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.")
378 int64 DivideCeil(int64_t x, int64_t y);
379 
380 // Calculate the number of threads/blocks required to process element_count
381 // elements. Note that you can still end up with more threads than
382 // element_count due to rounding, so kernels often start with an "is this
383 // thread id in the element_count range?" test.
384 void CalculateDimensionality(const DeviceDescription &device_description,
385                              int64_t element_count, int64 *threads_per_block,
386                              int64 *block_count);
387 
388 }  // namespace stream_executor
389 
390 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
391