1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA 17 // device and platform properties. Also contains convenience functions for 18 // checking/calculating launch dimensionality based on device properties. 19 20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 22 23 #include <map> 24 #include <memory> 25 26 #include "absl/base/macros.h" 27 #include "tensorflow/stream_executor/launch_dim.h" 28 #include "tensorflow/stream_executor/lib/statusor.h" 29 #include "tensorflow/stream_executor/platform/port.h" 30 31 namespace stream_executor { 32 namespace internal { 33 class DeviceDescriptionBuilder; 34 } // namespace internal 35 36 // CUDA compute capability, as reported by the device description. 37 struct CudaComputeCapability { 38 int major = 0; 39 int minor = 0; 40 41 // MSVC does not like "PASCAL" symbol. 42 enum CudaComputeCapabilities { PASCAL_ = 6, VOLTA = 7, AMPERE = 8 }; 43 CudaComputeCapabilityCudaComputeCapability44 CudaComputeCapability() {} CudaComputeCapabilityCudaComputeCapability45 CudaComputeCapability(int major, int minor) { 46 this->major = major; 47 this->minor = minor; 48 } 49 50 bool IsAtLeast(int other_major, int other_minor = 0) const { 51 return !(*this < CudaComputeCapability{other_major, other_minor}); 52 } 53 54 bool operator<(const CudaComputeCapability &other) const { 55 return ToPair() < other.ToPair(); 56 } 57 58 bool operator==(const CudaComputeCapability &other) const { 59 return ToPair() == other.ToPair(); 60 } 61 62 bool operator!=(const CudaComputeCapability &other) const { 63 return !(*this == other); 64 } 65 ToStringCudaComputeCapability66 std::string ToString() const { return absl::StrCat(major, ".", minor); } 67 ToPairCudaComputeCapability68 std::pair<int, int> ToPair() const { return std::make_pair(major, minor); } 69 }; 70 71 // Data that describes the execution target of the StreamExecutor, in terms of 72 // important logical parameters. These include dimensionality limits and 73 // physical parameters of interest, such as number of cores present on the 74 // device. 75 // 76 // Thread-safe: immutable post-initialization. 77 class DeviceDescription { 78 public: 79 // Returns the platform being run on; this value is primarily intended for 80 // printing, and comes out something like "OpenCL 1.2" or "Compute Capability 81 // 3.5". platform_version()82 const std::string &platform_version() const { return platform_version_; } 83 84 // Returns the driver version interfacing with the underlying platform. Vendor 85 // dependent format. driver_version()86 const std::string &driver_version() const { return driver_version_; } 87 88 // Return the runtime version, if one is provided by the underlying platform. 89 // Vendor dependent format / usefulness. runtime_version()90 const std::string &runtime_version() const { return runtime_version_; } 91 92 // Returns the name that the device reports. Vendor dependent. name()93 const std::string &name() const { return name_; } 94 95 // Returns the PCI bus identifier for this device, of the form 96 // [domain]:[bus]:[device].[function] pci_bus_id()97 const std::string &pci_bus_id() const { return pci_bus_id_; } 98 99 // Returns the NUMA node associated with this device, for use in 100 // determining socket locality. If the NUMA node could not be determined, -1 101 // is returned. numa_node()102 int numa_node() const { return numa_node_; } 103 104 // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device 105 // or an AMD Compute Unit. core_count()106 int core_count() const { return core_count_; } 107 108 // Returns the limit on the thread dimensionality values in each of the 109 // respective dimensions. These limits affect what constitutes a legitimate 110 // kernel launch request. thread_dim_limit()111 const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; } 112 113 // Returns the limit on the block dimensionality values in each of the 114 // respective dimensions. These limits may affect what constitutes a 115 // legitimate kernel launch request. block_dim_limit()116 const BlockDim &block_dim_limit() const { return block_dim_limit_; } 117 118 // Returns the limit on the total number of threads that can be launched in a 119 // single block; i.e. the limit on x * y * z dimensions of a ThreadDim. 120 // This limit affects what constitutes a legitimate kernel launch request. threads_per_block_limit()121 const int64 &threads_per_block_limit() const { 122 return threads_per_block_limit_; 123 } 124 125 // Returns the limit on the total number of threads that can be simultaneously 126 // launched on a given multiprocessor. threads_per_core_limit()127 const int64 &threads_per_core_limit() const { 128 return threads_per_core_limit_; 129 } 130 131 // Returns the number of threads per warp/wavefront. threads_per_warp()132 const int64 &threads_per_warp() const { return threads_per_warp_; } 133 134 // Returns the limit on the total number of registers per core. registers_per_core_limit()135 const int64 ®isters_per_core_limit() const { 136 return registers_per_core_limit_; 137 } 138 139 // Returns the limit on the total number of registers that can be 140 // simultaneously used by a block. registers_per_block_limit()141 const int64 ®isters_per_block_limit() const { 142 return registers_per_block_limit_; 143 } 144 145 // Returns the number of address bits available to kernel code running on the 146 // platform. This affects things like the maximum allocation size and perhaps 147 // types used in kernel code such as size_t. device_address_bits()148 const int64 &device_address_bits() const { return device_address_bits_; } 149 150 // Returns the device memory size in bytes. device_memory_size()151 int64 device_memory_size() const { return device_memory_size_; } 152 153 // Returns the device's memory bandwidth in bytes/sec. (This is for 154 // reads/writes to/from the device's own memory, not for transfers between the 155 // host and device.) memory_bandwidth()156 int64 memory_bandwidth() const { return memory_bandwidth_; } 157 158 // Returns the device's core clock rate in GHz. clock_rate_ghz()159 float clock_rate_ghz() const { return clock_rate_ghz_; } 160 161 // Returns whether ECC is enabled. ecc_enabled()162 bool ecc_enabled() const { return ecc_enabled_; } 163 164 // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced 165 // Micro Devices, Inc.", or "GenuineIntel". device_vendor()166 const std::string &device_vendor() const { return device_vendor_; } 167 168 // Returns the CUDA compute capability if we're running on the CUDA platform. 169 // If a CUDA compute capability is not available, the major version will be 170 // zero. 171 CudaComputeCapability cuda_compute_capability() const; 172 173 // Returns the AMDGPU ISA version if we're running on the ROCm platform. 174 // If the information is not available, the version is not modified, 175 // and the return value will be false. 176 bool rocm_amdgpu_isa_version(int *version) const; 177 178 // Returns the 179 // * AMDGPU GCN Architecture Name if we're running on the ROCm platform. 180 // * kUndefinedString otherwise rocm_amdgpu_gcn_arch_name()181 const std::string rocm_amdgpu_gcn_arch_name() const { 182 return rocm_amdgpu_gcn_arch_name_; 183 } 184 185 // Returns the maximum amount of shared memory present on a single core 186 // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL 187 // devices). Note that some devices, such as NVIDIA's have a configurable 188 // partitioning between shared memory and L1 cache. shared_memory_per_core()189 int64 shared_memory_per_core() const { return shared_memory_per_core_; } 190 191 // Returns the maximum amount of shared memory available for a single block. shared_memory_per_block()192 int64 shared_memory_per_block() const { return shared_memory_per_block_; } 193 194 // TODO(leary): resident blocks per core will be useful. 195 196 // Convenience typedef for the string-based DeviceDescription mapping. 197 typedef std::map<std::string, std::string> Map; 198 199 // Returns a mapping from readable names to readable values that describe the 200 // device. This is useful for things like printing. 201 std::unique_ptr<Map> ToMap() const; 202 203 // For string values that are not available via the underlying platform, this 204 // value will be provided. 205 static const char *kUndefinedString; 206 207 private: 208 friend class internal::DeviceDescriptionBuilder; 209 210 DeviceDescription(); 211 212 // For description of the following members, see the corresponding accessor 213 // above. 214 // 215 // N.B. If another field is added, update ToMap() above. 216 std::string device_vendor_; 217 std::string platform_version_; 218 std::string driver_version_; 219 std::string runtime_version_; 220 std::string pci_bus_id_; 221 std::string name_; 222 223 ThreadDim thread_dim_limit_; 224 BlockDim block_dim_limit_; 225 226 int64 threads_per_core_limit_; 227 int64 threads_per_block_limit_; 228 int64 threads_per_warp_; 229 230 int64 registers_per_core_limit_; 231 int64 registers_per_block_limit_; 232 233 int64 device_address_bits_; 234 int64 device_memory_size_; 235 int64 memory_bandwidth_; 236 237 // Shared memory limits on a given device. 238 int64 shared_memory_per_core_; 239 int64 shared_memory_per_block_; 240 241 float clock_rate_ghz_; 242 243 // CUDA "CC" major value, -1 if not available. 244 CudaComputeCapability cuda_compute_capability_{-1, -1}; 245 246 // ROCM AMDGPU ISA version, 0 if not available. 247 int rocm_amdgpu_isa_version_; 248 249 // ROCm AMDGPU GCN Architecture name, "" if not available. 250 std::string rocm_amdgpu_gcn_arch_name_; 251 252 int numa_node_; 253 int core_count_; 254 bool ecc_enabled_; 255 256 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription); 257 }; 258 259 namespace internal { 260 261 // Helper class the builds a device description, given that it has a large 262 // number of fields that would be easily confused in constructor form. 263 class DeviceDescriptionBuilder { 264 public: 265 DeviceDescriptionBuilder(); 266 267 // For descriptions of the following fields, see comments on the corresponding 268 // DeviceDescription::* accessors above. 269 set_device_vendor(const std::string & value)270 void set_device_vendor(const std::string &value) { 271 device_description_->device_vendor_ = value; 272 } set_platform_version(const std::string & value)273 void set_platform_version(const std::string &value) { 274 device_description_->platform_version_ = value; 275 } set_driver_version(const std::string & value)276 void set_driver_version(const std::string &value) { 277 device_description_->driver_version_ = value; 278 } set_runtime_version(const std::string & value)279 void set_runtime_version(const std::string &value) { 280 device_description_->runtime_version_ = value; 281 } set_pci_bus_id(const std::string & value)282 void set_pci_bus_id(const std::string &value) { 283 device_description_->pci_bus_id_ = value; 284 } set_name(const std::string & value)285 void set_name(const std::string &value) { 286 device_description_->name_ = value; 287 } 288 set_thread_dim_limit(const ThreadDim & value)289 void set_thread_dim_limit(const ThreadDim &value) { 290 device_description_->thread_dim_limit_ = value; 291 } set_block_dim_limit(const BlockDim & value)292 void set_block_dim_limit(const BlockDim &value) { 293 device_description_->block_dim_limit_ = value; 294 } 295 set_threads_per_core_limit(int64_t value)296 void set_threads_per_core_limit(int64_t value) { 297 device_description_->threads_per_core_limit_ = value; 298 } set_threads_per_block_limit(int64_t value)299 void set_threads_per_block_limit(int64_t value) { 300 device_description_->threads_per_block_limit_ = value; 301 } set_threads_per_warp(int64_t value)302 void set_threads_per_warp(int64_t value) { 303 device_description_->threads_per_warp_ = value; 304 } 305 set_registers_per_core_limit(int64_t value)306 void set_registers_per_core_limit(int64_t value) { 307 device_description_->registers_per_core_limit_ = value; 308 } set_registers_per_block_limit(int64_t value)309 void set_registers_per_block_limit(int64_t value) { 310 device_description_->registers_per_block_limit_ = value; 311 } 312 set_device_address_bits(int64_t value)313 void set_device_address_bits(int64_t value) { 314 device_description_->device_address_bits_ = value; 315 } set_device_memory_size(int64_t value)316 void set_device_memory_size(int64_t value) { 317 device_description_->device_memory_size_ = value; 318 } set_memory_bandwidth(int64_t value)319 void set_memory_bandwidth(int64_t value) { 320 device_description_->memory_bandwidth_ = value; 321 } 322 set_shared_memory_per_core(int64_t value)323 void set_shared_memory_per_core(int64_t value) { 324 device_description_->shared_memory_per_core_ = value; 325 } set_shared_memory_per_block(int64_t value)326 void set_shared_memory_per_block(int64_t value) { 327 device_description_->shared_memory_per_block_ = value; 328 } 329 set_clock_rate_ghz(float value)330 void set_clock_rate_ghz(float value) { 331 device_description_->clock_rate_ghz_ = value; 332 } 333 set_cuda_compute_capability(int major,int minor)334 void set_cuda_compute_capability(int major, int minor) { 335 device_description_->cuda_compute_capability_ = 336 CudaComputeCapability{major, minor}; 337 } 338 set_rocm_amdgpu_isa_version(int version)339 void set_rocm_amdgpu_isa_version(int version) { 340 device_description_->rocm_amdgpu_isa_version_ = version; 341 } 342 set_rocm_amdgpu_gcn_arch_name(const std::string & gcn_arch_name)343 void set_rocm_amdgpu_gcn_arch_name(const std::string &gcn_arch_name) { 344 device_description_->rocm_amdgpu_gcn_arch_name_ = gcn_arch_name; 345 } 346 set_numa_node(int value)347 void set_numa_node(int value) { device_description_->numa_node_ = value; } set_core_count(int value)348 void set_core_count(int value) { device_description_->core_count_ = value; } set_ecc_enabled(bool value)349 void set_ecc_enabled(bool value) { 350 device_description_->ecc_enabled_ = value; 351 } 352 353 // Returns a built DeviceDescription with ownership transferred to the 354 // caller. There are currently no restrictions on which fields must be set in 355 // order to build the descriptor. 356 // 357 // Once the description is built, this builder object should be discarded. Build()358 std::unique_ptr<DeviceDescription> Build() { 359 return std::move(device_description_); 360 } 361 362 private: 363 std::unique_ptr<DeviceDescription> device_description_; 364 365 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); 366 }; 367 368 } // namespace internal 369 370 // Returns whether the given thread_dim is acceptable given the limits described 371 // in device_description. For detailed reasons for failing the predicate, enable 372 // VLOG(2) for this module. 373 bool ThreadDimOk(const DeviceDescription &device_description, 374 const ThreadDim &thread_dim); 375 376 // Equivalent to ceil(double(element_count) / threads_per_block). 377 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.") 378 int64 DivideCeil(int64_t x, int64_t y); 379 380 // Calculate the number of threads/blocks required to process element_count 381 // elements. Note that you can still end up with more threads than 382 // element_count due to rounding, so kernels often start with an "is this 383 // thread id in the element_count range?" test. 384 void CalculateDimensionality(const DeviceDescription &device_description, 385 int64_t element_count, int64 *threads_per_block, 386 int64 *block_count); 387 388 } // namespace stream_executor 389 390 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 391