• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
17 // device and platform properties. Also contains convenience functions for
18 // checking/calculating launch dimensionality based on device properties.
19 
20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
22 
23 #include <map>
24 #include <memory>
25 #include "absl/base/macros.h"
26 #include "tensorflow/stream_executor/launch_dim.h"
27 #include "tensorflow/stream_executor/platform/port.h"
28 
29 namespace stream_executor {
30 namespace internal {
31 class DeviceDescriptionBuilder;
32 }  // namespace internal
33 
34 // Data that describes the execution target of the StreamExecutor, in terms of
35 // important logical parameters. These include dimensionality limits and
36 // physical parameters of interest, such as number of cores present on the
37 // device.
38 //
39 // Thread-safe: immutable post-initialization.
40 class DeviceDescription {
41  public:
42   // Returns the platform being run on; this value is primarily intended for
43   // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
44   // 3.5".
platform_version()45   const string &platform_version() const { return platform_version_; }
46 
47   // Returns the driver version interfacing with the underlying platform. Vendor
48   // dependent format.
driver_version()49   const string &driver_version() const { return driver_version_; }
50 
51   // Return the runtime version, if one is provided by the underlying platform.
52   // Vendor dependent format / usefulness.
runtime_version()53   const string &runtime_version() const { return runtime_version_; }
54 
55   // Returns the name that the device reports. Vendor dependent.
name()56   const string &name() const { return name_; }
57 
58   // Returns the PCI bus identifier for this device, of the form
59   // [domain]:[bus]:[device].[function]
pci_bus_id()60   const string &pci_bus_id() const { return pci_bus_id_; }
61 
62   // Returns the NUMA node associated with this device, for use in
63   // determining socket locality. If the NUMA node could not be determined, -1
64   // is returned.
numa_node()65   int numa_node() const { return numa_node_; }
66 
67   // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device
68   // or an AMD Compute Unit.
core_count()69   int core_count() const { return core_count_; }
70 
71   // Returns the limit on the thread dimensionality values in each of the
72   // respective dimensions. These limits affect what constitutes a legitimate
73   // kernel launch request.
thread_dim_limit()74   const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
75 
76   // Returns the limit on the block dimensionality values in each of the
77   // respective dimensions. These limits may affect what constitutes a
78   // legitimate kernel launch request.
block_dim_limit()79   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
80 
81   // Returns the limit on the total number of threads that can be launched in a
82   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
83   // This limit affects what constitutes a legitimate kernel launch request.
threads_per_block_limit()84   const int64 &threads_per_block_limit() const {
85     return threads_per_block_limit_;
86   }
87 
88   // Returns the limit on the total number of threads that can be simultaneously
89   // launched on a given multiprocessor.
threads_per_core_limit()90   const int64 &threads_per_core_limit() const {
91     return threads_per_core_limit_;
92   }
93 
94   // Returns the number of threads per warp/wavefront.
threads_per_warp()95   const int64 &threads_per_warp() const { return threads_per_warp_; }
96 
97   // Returns the limit on the total number of registers per core.
registers_per_core_limit()98   const int64 &registers_per_core_limit() const {
99     return registers_per_core_limit_;
100   }
101 
102   // Returns the limit on the total number of registers that can be
103   // simultaneously used by a block.
registers_per_block_limit()104   const int64 &registers_per_block_limit() const {
105     return registers_per_block_limit_;
106   }
107 
108   // Returns the number of address bits available to kernel code running on the
109   // platform. This affects things like the maximum allocation size and perhaps
110   // types used in kernel code such as size_t.
device_address_bits()111   const int64 &device_address_bits() const { return device_address_bits_; }
112 
113   // Returns the device memory size in bytes.
device_memory_size()114   int64 device_memory_size() const { return device_memory_size_; }
115 
116   // Returns the device's memory bandwidth in bytes/sec.  (This is for
117   // reads/writes to/from the device's own memory, not for transfers between the
118   // host and device.)
memory_bandwidth()119   int64 memory_bandwidth() const { return memory_bandwidth_; }
120 
121   // Returns the device's core clock rate in GHz.
clock_rate_ghz()122   float clock_rate_ghz() const { return clock_rate_ghz_; }
123 
124   // Returns whether ECC is enabled.
ecc_enabled()125   bool ecc_enabled() const { return ecc_enabled_; }
126 
127   // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
128   // Micro Devices, Inc.", or "GenuineIntel".
device_vendor()129   const string &device_vendor() const { return device_vendor_; }
130 
131   // Returns the CUDA compute capability if we're running on the CUDA platform.
132   // If a CUDA compute capability is not available, the major version will be
133   // zero, and the return value will be false.
134   bool cuda_compute_capability(int *major, int *minor) const;
135 
136   // Returns the AMDGPU ISA version if we're running on the ROCm platform.
137   // If the information is not available, the version is not modified,
138   // and the return value will be false.
139   bool rocm_amdgpu_isa_version(int *version) const;
140 
141   // Returns the maximum amount of shared memory present on a single core
142   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
143   // devices). Note that some devices, such as NVIDIA's have a configurable
144   // partitioning between shared memory and L1 cache.
shared_memory_per_core()145   int64 shared_memory_per_core() const { return shared_memory_per_core_; }
146 
147   // Returns the maximum amount of shared memory available for a single block.
shared_memory_per_block()148   int64 shared_memory_per_block() const { return shared_memory_per_block_; }
149 
150   // TODO(leary): resident blocks per core will be useful.
151 
152   // Convenience typedef for the string-based DeviceDescription mapping.
153   typedef std::map<string, string> Map;
154 
155   // Returns a mapping from readable names to readable values that describe the
156   // device. This is useful for things like printing.
157   std::unique_ptr<Map> ToMap() const;
158 
159   // For string values that are not available via the underlying platform, this
160   // value will be provided.
161   static const char *kUndefinedString;
162 
163  private:
164   friend class internal::DeviceDescriptionBuilder;
165 
166   DeviceDescription();
167 
168   // For description of the following members, see the corresponding accessor
169   // above.
170   //
171   // N.B. If another field is added, update ToMap() above.
172   string device_vendor_;
173   string platform_version_;
174   string driver_version_;
175   string runtime_version_;
176   string pci_bus_id_;
177   string name_;
178 
179   ThreadDim thread_dim_limit_;
180   BlockDim block_dim_limit_;
181 
182   int64 threads_per_core_limit_;
183   int64 threads_per_block_limit_;
184   int64 threads_per_warp_;
185 
186   int64 registers_per_core_limit_;
187   int64 registers_per_block_limit_;
188 
189   int64 device_address_bits_;
190   int64 device_memory_size_;
191   int64 memory_bandwidth_;
192 
193   // Shared memory limits on a given device.
194   int64 shared_memory_per_core_;
195   int64 shared_memory_per_block_;
196 
197   float clock_rate_ghz_;
198 
199   // CUDA "CC" major value, -1 if not available.
200   int cuda_compute_capability_major_;
201   int cuda_compute_capability_minor_;
202 
203   // ROCM AMDGPU ISA version, 0 if not available.
204   int rocm_amdgpu_isa_version_;
205 
206   int numa_node_;
207   int core_count_;
208   bool ecc_enabled_;
209 
210   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
211 };
212 
213 namespace internal {
214 
215 // Helper class the builds a device description, given that it has a large
216 // number of fields that would be easily confused in constructor form.
217 class DeviceDescriptionBuilder {
218  public:
219   DeviceDescriptionBuilder();
220 
221   // For descriptions of the following fields, see comments on the corresponding
222   // DeviceDescription::* accessors above.
223 
set_device_vendor(const string & value)224   void set_device_vendor(const string &value) {
225     device_description_->device_vendor_ = value;
226   }
set_platform_version(const string & value)227   void set_platform_version(const string &value) {
228     device_description_->platform_version_ = value;
229   }
set_driver_version(const string & value)230   void set_driver_version(const string &value) {
231     device_description_->driver_version_ = value;
232   }
set_runtime_version(const string & value)233   void set_runtime_version(const string &value) {
234     device_description_->runtime_version_ = value;
235   }
set_pci_bus_id(const string & value)236   void set_pci_bus_id(const string &value) {
237     device_description_->pci_bus_id_ = value;
238   }
set_name(const string & value)239   void set_name(const string &value) { device_description_->name_ = value; }
240 
set_thread_dim_limit(const ThreadDim & value)241   void set_thread_dim_limit(const ThreadDim &value) {
242     device_description_->thread_dim_limit_ = value;
243   }
set_block_dim_limit(const BlockDim & value)244   void set_block_dim_limit(const BlockDim &value) {
245     device_description_->block_dim_limit_ = value;
246   }
247 
set_threads_per_core_limit(int64 value)248   void set_threads_per_core_limit(int64 value) {
249     device_description_->threads_per_core_limit_ = value;
250   }
set_threads_per_block_limit(int64 value)251   void set_threads_per_block_limit(int64 value) {
252     device_description_->threads_per_block_limit_ = value;
253   }
set_threads_per_warp(int64 value)254   void set_threads_per_warp(int64 value) {
255     device_description_->threads_per_warp_ = value;
256   }
257 
set_registers_per_core_limit(int64 value)258   void set_registers_per_core_limit(int64 value) {
259     device_description_->registers_per_core_limit_ = value;
260   }
set_registers_per_block_limit(int64 value)261   void set_registers_per_block_limit(int64 value) {
262     device_description_->registers_per_block_limit_ = value;
263   }
264 
set_device_address_bits(int64 value)265   void set_device_address_bits(int64 value) {
266     device_description_->device_address_bits_ = value;
267   }
set_device_memory_size(int64 value)268   void set_device_memory_size(int64 value) {
269     device_description_->device_memory_size_ = value;
270   }
set_memory_bandwidth(int64 value)271   void set_memory_bandwidth(int64 value) {
272     device_description_->memory_bandwidth_ = value;
273   }
274 
set_shared_memory_per_core(int64 value)275   void set_shared_memory_per_core(int64 value) {
276     device_description_->shared_memory_per_core_ = value;
277   }
set_shared_memory_per_block(int64 value)278   void set_shared_memory_per_block(int64 value) {
279     device_description_->shared_memory_per_block_ = value;
280   }
281 
set_clock_rate_ghz(float value)282   void set_clock_rate_ghz(float value) {
283     device_description_->clock_rate_ghz_ = value;
284   }
285 
set_cuda_compute_capability(int major,int minor)286   void set_cuda_compute_capability(int major, int minor) {
287     device_description_->cuda_compute_capability_major_ = major;
288     device_description_->cuda_compute_capability_minor_ = minor;
289   }
290 
set_rocm_amdgpu_isa_version(int version)291   void set_rocm_amdgpu_isa_version(int version) {
292     device_description_->rocm_amdgpu_isa_version_ = version;
293   }
294 
set_numa_node(int value)295   void set_numa_node(int value) { device_description_->numa_node_ = value; }
set_core_count(int value)296   void set_core_count(int value) { device_description_->core_count_ = value; }
set_ecc_enabled(bool value)297   void set_ecc_enabled(bool value) {
298     device_description_->ecc_enabled_ = value;
299   }
300 
301   // Returns a built DeviceDescription with ownership transferred to the
302   // caller. There are currently no restrictions on which fields must be set in
303   // order to build the descriptor.
304   //
305   // Once the description is built, this builder object should be discarded.
Build()306   std::unique_ptr<DeviceDescription> Build() {
307     return std::move(device_description_);
308   }
309 
310  private:
311   std::unique_ptr<DeviceDescription> device_description_;
312 
313   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
314 };
315 
316 }  // namespace internal
317 
318 // Returns whether the given thread_dim is acceptable given the limits described
319 // in device_description. For detailed reasons for failing the predicate, enable
320 // VLOG(2) for this module.
321 bool ThreadDimOk(const DeviceDescription &device_description,
322                  const ThreadDim &thread_dim);
323 
324 // Equivalent to ceil(double(element_count) / threads_per_block).
325 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.")
326 int64 DivideCeil(int64 x, int64 y);
327 
328 // Calculate the number of threads/blocks required to process element_count
329 // elements. Note that you can still end up with more threads than
330 // element_count due to rounding, so kernels often start with an "is this
331 // thread id in the element_count range?" test.
332 void CalculateDimensionality(const DeviceDescription &device_description,
333                              int64 element_count, int64 *threads_per_block,
334                              int64 *block_count);
335 
336 }  // namespace stream_executor
337 
338 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
339