1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA 17 // device and platform properties. Also contains convenience functions for 18 // checking/calculating launch dimensionality based on device properties. 19 20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 22 23 #include <map> 24 #include <memory> 25 #include "absl/base/macros.h" 26 #include "tensorflow/stream_executor/launch_dim.h" 27 #include "tensorflow/stream_executor/platform/port.h" 28 29 namespace stream_executor { 30 namespace internal { 31 class DeviceDescriptionBuilder; 32 } // namespace internal 33 34 // Data that describes the execution target of the StreamExecutor, in terms of 35 // important logical parameters. These include dimensionality limits and 36 // physical parameters of interest, such as number of cores present on the 37 // device. 38 // 39 // Thread-safe: immutable post-initialization. 40 class DeviceDescription { 41 public: 42 // Returns the platform being run on; this value is primarily intended for 43 // printing, and comes out something like "OpenCL 1.2" or "Compute Capability 44 // 3.5". platform_version()45 const std::string &platform_version() const { return platform_version_; } 46 47 // Returns the driver version interfacing with the underlying platform. Vendor 48 // dependent format. driver_version()49 const std::string &driver_version() const { return driver_version_; } 50 51 // Return the runtime version, if one is provided by the underlying platform. 52 // Vendor dependent format / usefulness. runtime_version()53 const std::string &runtime_version() const { return runtime_version_; } 54 55 // Returns the name that the device reports. Vendor dependent. name()56 const std::string &name() const { return name_; } 57 58 // Returns the PCI bus identifier for this device, of the form 59 // [domain]:[bus]:[device].[function] pci_bus_id()60 const std::string &pci_bus_id() const { return pci_bus_id_; } 61 62 // Returns the NUMA node associated with this device, for use in 63 // determining socket locality. If the NUMA node could not be determined, -1 64 // is returned. numa_node()65 int numa_node() const { return numa_node_; } 66 67 // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device 68 // or an AMD Compute Unit. core_count()69 int core_count() const { return core_count_; } 70 71 // Returns the limit on the thread dimensionality values in each of the 72 // respective dimensions. These limits affect what constitutes a legitimate 73 // kernel launch request. thread_dim_limit()74 const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; } 75 76 // Returns the limit on the block dimensionality values in each of the 77 // respective dimensions. These limits may affect what constitutes a 78 // legitimate kernel launch request. block_dim_limit()79 const BlockDim &block_dim_limit() const { return block_dim_limit_; } 80 81 // Returns the limit on the total number of threads that can be launched in a 82 // single block; i.e. the limit on x * y * z dimensions of a ThreadDim. 83 // This limit affects what constitutes a legitimate kernel launch request. threads_per_block_limit()84 const int64 &threads_per_block_limit() const { 85 return threads_per_block_limit_; 86 } 87 88 // Returns the limit on the total number of threads that can be simultaneously 89 // launched on a given multiprocessor. threads_per_core_limit()90 const int64 &threads_per_core_limit() const { 91 return threads_per_core_limit_; 92 } 93 94 // Returns the number of threads per warp/wavefront. threads_per_warp()95 const int64 &threads_per_warp() const { return threads_per_warp_; } 96 97 // Returns the limit on the total number of registers per core. registers_per_core_limit()98 const int64 ®isters_per_core_limit() const { 99 return registers_per_core_limit_; 100 } 101 102 // Returns the limit on the total number of registers that can be 103 // simultaneously used by a block. registers_per_block_limit()104 const int64 ®isters_per_block_limit() const { 105 return registers_per_block_limit_; 106 } 107 108 // Returns the number of address bits available to kernel code running on the 109 // platform. This affects things like the maximum allocation size and perhaps 110 // types used in kernel code such as size_t. device_address_bits()111 const int64 &device_address_bits() const { return device_address_bits_; } 112 113 // Returns the device memory size in bytes. device_memory_size()114 int64 device_memory_size() const { return device_memory_size_; } 115 116 // Returns the device's memory bandwidth in bytes/sec. (This is for 117 // reads/writes to/from the device's own memory, not for transfers between the 118 // host and device.) memory_bandwidth()119 int64 memory_bandwidth() const { return memory_bandwidth_; } 120 121 // Returns the device's core clock rate in GHz. clock_rate_ghz()122 float clock_rate_ghz() const { return clock_rate_ghz_; } 123 124 // Returns whether ECC is enabled. ecc_enabled()125 bool ecc_enabled() const { return ecc_enabled_; } 126 127 // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced 128 // Micro Devices, Inc.", or "GenuineIntel". device_vendor()129 const std::string &device_vendor() const { return device_vendor_; } 130 131 // Returns the CUDA compute capability if we're running on the CUDA platform. 132 // If a CUDA compute capability is not available, the major version will be 133 // zero, and the return value will be false. 134 bool cuda_compute_capability(int *major, int *minor) const; 135 136 // Returns the AMDGPU ISA version if we're running on the ROCm platform. 137 // If the information is not available, the version is not modified, 138 // and the return value will be false. 139 bool rocm_amdgpu_isa_version(int *version) const; 140 141 // Returns the 142 // * AMDGPU GCN Architecture Name if we're running on the ROCm platform. 143 // * kUndefinedString otherwise rocm_amdgpu_gcn_arch_name()144 const std::string rocm_amdgpu_gcn_arch_name() const { 145 return rocm_amdgpu_gcn_arch_name_; 146 } 147 148 // Returns the maximum amount of shared memory present on a single core 149 // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL 150 // devices). Note that some devices, such as NVIDIA's have a configurable 151 // partitioning between shared memory and L1 cache. shared_memory_per_core()152 int64 shared_memory_per_core() const { return shared_memory_per_core_; } 153 154 // Returns the maximum amount of shared memory available for a single block. shared_memory_per_block()155 int64 shared_memory_per_block() const { return shared_memory_per_block_; } 156 157 // TODO(leary): resident blocks per core will be useful. 158 159 // Convenience typedef for the string-based DeviceDescription mapping. 160 typedef std::map<std::string, std::string> Map; 161 162 // Returns a mapping from readable names to readable values that describe the 163 // device. This is useful for things like printing. 164 std::unique_ptr<Map> ToMap() const; 165 166 // For string values that are not available via the underlying platform, this 167 // value will be provided. 168 static const char *kUndefinedString; 169 170 private: 171 friend class internal::DeviceDescriptionBuilder; 172 173 DeviceDescription(); 174 175 // For description of the following members, see the corresponding accessor 176 // above. 177 // 178 // N.B. If another field is added, update ToMap() above. 179 std::string device_vendor_; 180 std::string platform_version_; 181 std::string driver_version_; 182 std::string runtime_version_; 183 std::string pci_bus_id_; 184 std::string name_; 185 186 ThreadDim thread_dim_limit_; 187 BlockDim block_dim_limit_; 188 189 int64 threads_per_core_limit_; 190 int64 threads_per_block_limit_; 191 int64 threads_per_warp_; 192 193 int64 registers_per_core_limit_; 194 int64 registers_per_block_limit_; 195 196 int64 device_address_bits_; 197 int64 device_memory_size_; 198 int64 memory_bandwidth_; 199 200 // Shared memory limits on a given device. 201 int64 shared_memory_per_core_; 202 int64 shared_memory_per_block_; 203 204 float clock_rate_ghz_; 205 206 // CUDA "CC" major value, -1 if not available. 207 int cuda_compute_capability_major_; 208 int cuda_compute_capability_minor_; 209 210 // ROCM AMDGPU ISA version, 0 if not available. 211 int rocm_amdgpu_isa_version_; 212 213 // ROCm AMDGPU GCN Architecture name, "" if not available. 214 std::string rocm_amdgpu_gcn_arch_name_; 215 216 int numa_node_; 217 int core_count_; 218 bool ecc_enabled_; 219 220 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription); 221 }; 222 223 namespace internal { 224 225 // Helper class the builds a device description, given that it has a large 226 // number of fields that would be easily confused in constructor form. 227 class DeviceDescriptionBuilder { 228 public: 229 DeviceDescriptionBuilder(); 230 231 // For descriptions of the following fields, see comments on the corresponding 232 // DeviceDescription::* accessors above. 233 set_device_vendor(const std::string & value)234 void set_device_vendor(const std::string &value) { 235 device_description_->device_vendor_ = value; 236 } set_platform_version(const std::string & value)237 void set_platform_version(const std::string &value) { 238 device_description_->platform_version_ = value; 239 } set_driver_version(const std::string & value)240 void set_driver_version(const std::string &value) { 241 device_description_->driver_version_ = value; 242 } set_runtime_version(const std::string & value)243 void set_runtime_version(const std::string &value) { 244 device_description_->runtime_version_ = value; 245 } set_pci_bus_id(const std::string & value)246 void set_pci_bus_id(const std::string &value) { 247 device_description_->pci_bus_id_ = value; 248 } set_name(const std::string & value)249 void set_name(const std::string &value) { 250 device_description_->name_ = value; 251 } 252 set_thread_dim_limit(const ThreadDim & value)253 void set_thread_dim_limit(const ThreadDim &value) { 254 device_description_->thread_dim_limit_ = value; 255 } set_block_dim_limit(const BlockDim & value)256 void set_block_dim_limit(const BlockDim &value) { 257 device_description_->block_dim_limit_ = value; 258 } 259 set_threads_per_core_limit(int64 value)260 void set_threads_per_core_limit(int64 value) { 261 device_description_->threads_per_core_limit_ = value; 262 } set_threads_per_block_limit(int64 value)263 void set_threads_per_block_limit(int64 value) { 264 device_description_->threads_per_block_limit_ = value; 265 } set_threads_per_warp(int64 value)266 void set_threads_per_warp(int64 value) { 267 device_description_->threads_per_warp_ = value; 268 } 269 set_registers_per_core_limit(int64 value)270 void set_registers_per_core_limit(int64 value) { 271 device_description_->registers_per_core_limit_ = value; 272 } set_registers_per_block_limit(int64 value)273 void set_registers_per_block_limit(int64 value) { 274 device_description_->registers_per_block_limit_ = value; 275 } 276 set_device_address_bits(int64 value)277 void set_device_address_bits(int64 value) { 278 device_description_->device_address_bits_ = value; 279 } set_device_memory_size(int64 value)280 void set_device_memory_size(int64 value) { 281 device_description_->device_memory_size_ = value; 282 } set_memory_bandwidth(int64 value)283 void set_memory_bandwidth(int64 value) { 284 device_description_->memory_bandwidth_ = value; 285 } 286 set_shared_memory_per_core(int64 value)287 void set_shared_memory_per_core(int64 value) { 288 device_description_->shared_memory_per_core_ = value; 289 } set_shared_memory_per_block(int64 value)290 void set_shared_memory_per_block(int64 value) { 291 device_description_->shared_memory_per_block_ = value; 292 } 293 set_clock_rate_ghz(float value)294 void set_clock_rate_ghz(float value) { 295 device_description_->clock_rate_ghz_ = value; 296 } 297 set_cuda_compute_capability(int major,int minor)298 void set_cuda_compute_capability(int major, int minor) { 299 device_description_->cuda_compute_capability_major_ = major; 300 device_description_->cuda_compute_capability_minor_ = minor; 301 } 302 set_rocm_amdgpu_isa_version(int version)303 void set_rocm_amdgpu_isa_version(int version) { 304 device_description_->rocm_amdgpu_isa_version_ = version; 305 } 306 set_rocm_amdgpu_gcn_arch_name(const std::string & gcn_arch_name)307 void set_rocm_amdgpu_gcn_arch_name(const std::string &gcn_arch_name) { 308 device_description_->rocm_amdgpu_gcn_arch_name_ = gcn_arch_name; 309 } 310 set_numa_node(int value)311 void set_numa_node(int value) { device_description_->numa_node_ = value; } set_core_count(int value)312 void set_core_count(int value) { device_description_->core_count_ = value; } set_ecc_enabled(bool value)313 void set_ecc_enabled(bool value) { 314 device_description_->ecc_enabled_ = value; 315 } 316 317 // Returns a built DeviceDescription with ownership transferred to the 318 // caller. There are currently no restrictions on which fields must be set in 319 // order to build the descriptor. 320 // 321 // Once the description is built, this builder object should be discarded. Build()322 std::unique_ptr<DeviceDescription> Build() { 323 return std::move(device_description_); 324 } 325 326 private: 327 std::unique_ptr<DeviceDescription> device_description_; 328 329 SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder); 330 }; 331 332 } // namespace internal 333 334 // Returns whether the given thread_dim is acceptable given the limits described 335 // in device_description. For detailed reasons for failing the predicate, enable 336 // VLOG(2) for this module. 337 bool ThreadDimOk(const DeviceDescription &device_description, 338 const ThreadDim &thread_dim); 339 340 // Equivalent to ceil(double(element_count) / threads_per_block). 341 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.") 342 int64 DivideCeil(int64 x, int64 y); 343 344 // Calculate the number of threads/blocks required to process element_count 345 // elements. Note that you can still end up with more threads than 346 // element_count due to rounding, so kernels often start with an "is this 347 // thread id in the element_count range?" test. 348 void CalculateDimensionality(const DeviceDescription &device_description, 349 int64 element_count, int64 *threads_per_block, 350 int64 *block_count); 351 352 } // namespace stream_executor 353 354 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_ 355