1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA 17 // device and platform properties. Also contains convenience functions for 18 // checking/calculating launch dimensionality based on device properties. 19 20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 22 23 #include <map> 24 #include <memory> 25 #include "absl/base/macros.h" 26 #include "tensorflow/stream_executor/launch_dim.h" 27 #include "tensorflow/stream_executor/platform/port.h" 28 29 namespace stream_executor { 30 namespace internal { 31 class DeviceDescriptionBuilder; 32 } // namespace internal 33 34 // Data that describes the execution target of the StreamExecutor, in terms of 35 // important logical parameters. These include dimensionality limits and 36 // physical parameters of interest, such as number of cores present on the 37 // device. 38 // 39 // Thread-safe: immutable post-initialization. 40 class DeviceDescription { 41 public: 42 // Returns the platform being run on; this value is primarily intended for 43 // printing, and comes out something like "OpenCL 1.2" or "Compute Capability 44 // 3.5". platform_version()45 const string &platform_version() const { return platform_version_; } 46 47 // Returns the driver version interfacing with the underlying platform. Vendor 48 // dependent format. driver_version()49 const string &driver_version() const { return driver_version_; } 50 51 // Return the runtime version, if one is provided by the underlying platform. 52 // Vendor dependent format / usefulness. runtime_version()53 const string &runtime_version() const { return runtime_version_; } 54 55 // Returns the name that the device reports. Vendor dependent. name()56 const string &name() const { return name_; } 57 58 // Returns the PCI bus identifier for this device, of the form 59 // [domain]:[bus]:[device].[function] pci_bus_id()60 const string &pci_bus_id() const { return pci_bus_id_; } 61 62 // Returns the NUMA node associated with this device, for use in 63 // determining socket locality. If the NUMA node could not be determined, -1 64 // is returned. numa_node()65 int numa_node() const { return numa_node_; } 66 67 // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device 68 // or an AMD Compute Unit. core_count()69 int core_count() const { return core_count_; } 70 71 // Returns the limit on the thread dimensionality values in each of the 72 // respective dimensions. These limits affect what constitutes a legitimate 73 // kernel launch request. thread_dim_limit()74 const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; } 75 76 // Returns the limit on the block dimensionality values in each of the 77 // respective dimensions. These limits may affect what constitutes a 78 // legitimate kernel launch request. block_dim_limit()79 const BlockDim &block_dim_limit() const { return block_dim_limit_; } 80 81 // Returns the maximum number of simultaneously resident blocks 82 // on a multiprocessor. blocks_per_core_limit()83 int64 blocks_per_core_limit() const { return blocks_per_core_limit_; } 84 85 // Returns the limit on the total number of threads that can be launched in a 86 // single block; i.e. the limit on x * y * z dimensions of a ThreadDim. 87 // This limit affects what constitutes a legitimate kernel launch request. threads_per_block_limit()88 const int64 &threads_per_block_limit() const { 89 return threads_per_block_limit_; 90 } 91 92 // Returns the limit on the total number of threads that can be simultaneously 93 // launched on a given multiprocessor. threads_per_core_limit()94 const int64 &threads_per_core_limit() const { 95 return threads_per_core_limit_; 96 } 97 98 // Returns the number of threads per warp/wavefront. threads_per_warp()99 const int64 &threads_per_warp() const { return threads_per_warp_; } 100 101 // Returns the limit on the total number of registers per core. registers_per_core_limit()102 const int64 ®isters_per_core_limit() const { 103 return registers_per_core_limit_; 104 } 105 106 // Returns the limit on the total number of registers that can be 107 // simultaneously used by a block. registers_per_block_limit()108 const int64 ®isters_per_block_limit() const { 109 return registers_per_block_limit_; 110 } 111 112 // Returns the number of address bits available to kernel code running on the 113 // platform. This affects things like the maximum allocation size and perhaps 114 // types used in kernel code such as size_t. device_address_bits()115 const int64 &device_address_bits() const { return device_address_bits_; } 116 117 // Returns the device memory size in bytes. device_memory_size()118 int64 device_memory_size() const { return device_memory_size_; } 119 120 // Returns the device's memory bandwidth in bytes/sec. (This is for 121 // reads/writes to/from the device's own memory, not for transfers between the 122 // host and device.) memory_bandwidth()123 int64 memory_bandwidth() const { return memory_bandwidth_; } 124 125 // Returns the device's core clock rate in GHz. clock_rate_ghz()126 float clock_rate_ghz() const { return clock_rate_ghz_; } 127 128 // Returns whether ECC is enabled. ecc_enabled()129 bool ecc_enabled() const { return ecc_enabled_; } 130 131 // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced 132 // Micro Devices, Inc.", or "GenuineIntel". device_vendor()133 const string &device_vendor() const { return device_vendor_; } 134 135 // Returns the CUDA compute capability if we're running on the CUDA platform. 136 // If a CUDA compute capability is not available, the major version will be 137 // zero, and the return value will be false. 138 bool cuda_compute_capability(int *major, int *minor) const; 139 140 // Returns the AMDGPU ISA version if we're running on the ROCm platform. 141 // If the information is not available, the version is not modified, 142 // and the return value will be false. 143 bool rocm_amdgpu_isa_version(int *version) const; 144 145 // Returns the maximum amount of shared memory present on a single core 146 // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL 147 // devices). Note that some devices, such as NVIDIA's have a configurable 148 // partitioning between shared memory and L1 cache. shared_memory_per_core()149 int64 shared_memory_per_core() const { return shared_memory_per_core_; } 150 151 // Returns the maximum amount of shared memory available for a single block. shared_memory_per_block()152 int64 shared_memory_per_block() const { return shared_memory_per_block_; } 153 154 // TODO(leary): resident blocks per core will be useful. 155 156 // Convenience typedef for the string-based DeviceDescription mapping. 157 typedef std::map<string, string> Map; 158 159 // Returns a mapping from readable names to readable values that describe the 160 // device. This is useful for things like printing. 161 std::unique_ptr<Map> ToMap() const; 162 163 // For string values that are not available via the underlying platform, this 164 // value will be provided. 165 static const char *kUndefinedString; 166 167 private: 168 friend class internal::DeviceDescriptionBuilder; 169 170 DeviceDescription(); 171 172 // For description of the following members, see the corresponding accessor 173 // above. 174 // 175 // N.B. If another field is added, update ToMap() above. 176 string device_vendor_; 177 string platform_version_; 178 string driver_version_; 179 string runtime_version_; 180 string pci_bus_id_; 181 string name_; 182 183 ThreadDim thread_dim_limit_; 184 BlockDim block_dim_limit_; 185 186 int64 blocks_per_core_limit_; 187 188 int64 threads_per_core_limit_; 189 int64 threads_per_block_limit_; 190 int64 threads_per_warp_; 191 192 int64 registers_per_core_limit_; 193 int64 registers_per_block_limit_; 194 195 int64 device_address_bits_; 196 int64 device_memory_size_; 197 int64 memory_bandwidth_; 198 199 // Shared memory limits on a given device. 200 int64 shared_memory_per_core_; 201 int64 shared_memory_per_block_; 202 203 float clock_rate_ghz_; 204 205 // CUDA "CC" major value, -1 if not available. 206 int cuda_compute_capability_major_; 207 int cuda_compute_capability_minor_; 208 209 // ROCM AMDGPU ISA version, 0 if not available. 210 int rocm_amdgpu_isa_version_; 211 212 int numa_node_; 213 int core_count_; 214 bool ecc_enabled_; 215 216 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription); 217 }; 218 219 namespace internal { 220 221 // Helper class the builds a device description, given that it has a large 222 // number of fields that would be easily confused in constructor form. 223 class DeviceDescriptionBuilder { 224 public: 225 DeviceDescriptionBuilder(); 226 227 // For descriptions of the following fields, see comments on the corresponding 228 // DeviceDescription::* accessors above. 229 set_device_vendor(const string & value)230 void set_device_vendor(const string &value) { 231 device_description_->device_vendor_ = value; 232 } set_platform_version(const string & value)233 void set_platform_version(const string &value) { 234 device_description_->platform_version_ = value; 235 } set_driver_version(const string & value)236 void set_driver_version(const string &value) { 237 device_description_->driver_version_ = value; 238 } set_runtime_version(const string & value)239 void set_runtime_version(const string &value) { 240 device_description_->runtime_version_ = value; 241 } set_pci_bus_id(const string & value)242 void set_pci_bus_id(const string &value) { 243 device_description_->pci_bus_id_ = value; 244 } set_name(const string & value)245 void set_name(const string &value) { device_description_->name_ = value; } 246 set_thread_dim_limit(const ThreadDim & value)247 void set_thread_dim_limit(const ThreadDim &value) { 248 device_description_->thread_dim_limit_ = value; 249 } set_block_dim_limit(const BlockDim & value)250 void set_block_dim_limit(const BlockDim &value) { 251 device_description_->block_dim_limit_ = value; 252 } 253 set_blocks_per_core_limit(int64 value)254 void set_blocks_per_core_limit(int64 value) { 255 device_description_->blocks_per_core_limit_ = value; 256 } 257 set_threads_per_core_limit(int64 value)258 void set_threads_per_core_limit(int64 value) { 259 device_description_->threads_per_core_limit_ = value; 260 } set_threads_per_block_limit(int64 value)261 void set_threads_per_block_limit(int64 value) { 262 device_description_->threads_per_block_limit_ = value; 263 } set_threads_per_warp(int64 value)264 void set_threads_per_warp(int64 value) { 265 device_description_->threads_per_warp_ = value; 266 } 267 set_registers_per_core_limit(int64 value)268 void set_registers_per_core_limit(int64 value) { 269 device_description_->registers_per_core_limit_ = value; 270 } set_registers_per_block_limit(int64 value)271 void set_registers_per_block_limit(int64 value) { 272 device_description_->registers_per_block_limit_ = value; 273 } 274 set_device_address_bits(int64 value)275 void set_device_address_bits(int64 value) { 276 device_description_->device_address_bits_ = value; 277 } set_device_memory_size(int64 value)278 void set_device_memory_size(int64 value) { 279 device_description_->device_memory_size_ = value; 280 } set_memory_bandwidth(int64 value)281 void set_memory_bandwidth(int64 value) { 282 device_description_->memory_bandwidth_ = value; 283 } 284 set_shared_memory_per_core(int64 value)285 void set_shared_memory_per_core(int64 value) { 286 device_description_->shared_memory_per_core_ = value; 287 } set_shared_memory_per_block(int64 value)288 void set_shared_memory_per_block(int64 value) { 289 device_description_->shared_memory_per_block_ = value; 290 } 291 set_clock_rate_ghz(float value)292 void set_clock_rate_ghz(float value) { 293 device_description_->clock_rate_ghz_ = value; 294 } 295 set_cuda_compute_capability(int major,int minor)296 void set_cuda_compute_capability(int major, int minor) { 297 device_description_->cuda_compute_capability_major_ = major; 298 device_description_->cuda_compute_capability_minor_ = minor; 299 } 300 set_rocm_amdgpu_isa_version(int version)301 void set_rocm_amdgpu_isa_version(int version) { 302 device_description_->rocm_amdgpu_isa_version_ = version; 303 } 304 set_numa_node(int value)305 void set_numa_node(int value) { device_description_->numa_node_ = value; } set_core_count(int value)306 void set_core_count(int value) { device_description_->core_count_ = value; } set_ecc_enabled(bool value)307 void set_ecc_enabled(bool value) { 308 device_description_->ecc_enabled_ = value; 309 } 310 311 // Returns a built DeviceDescription with ownership transferred to the 312 // caller. There are currently no restrictions on which fields must be set in 313 // order to build the descriptor. 314 // 315 // Once the description is built, this builder object should be discarded. Build()316 std::unique_ptr<DeviceDescription> Build() { 317 return std::move(device_description_); 318 } 319 320 private: 321 std::unique_ptr<DeviceDescription> device_description_; 322 323 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); 324 }; 325 326 } // namespace internal 327 328 // Returns whether the given thread_dim is acceptable given the limits described 329 // in device_description. For detailed reasons for failing the predicate, enable 330 // VLOG(2) for this module. 331 bool ThreadDimOk(const DeviceDescription &device_description, 332 const ThreadDim &thread_dim); 333 334 // Equivalent to ceil(double(element_count) / threads_per_block). 335 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.") 336 int64 DivideCeil(int64 x, int64 y); 337 338 // Calculate the number of threads/blocks required to process element_count 339 // elements. Note that you can still end up with more threads than 340 // element_count due to rounding, so kernels often start with an "is this 341 // thread id in the element_count range?" test. 342 void CalculateDimensionality(const DeviceDescription &device_description, 343 int64 element_count, int64 *threads_per_block, 344 int64 *block_count); 345 346 } // namespace stream_executor 347 348 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 349