• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/device_description.h"
17 
18 #include <algorithm>
19 
20 #include "absl/strings/str_cat.h"
21 #include "tensorflow/stream_executor/lib/human_readable.h"
22 #include "tensorflow/stream_executor/lib/mathutil.h"
23 
24 namespace stream_executor {
25 
26 static const uint64 kUninitializedUint64 = -1ULL;
27 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
28 
DeviceDescription()29 DeviceDescription::DeviceDescription()
30     : device_vendor_(kUndefinedString),
31       platform_version_(kUndefinedString),
32       driver_version_(kUndefinedString),
33       runtime_version_(kUndefinedString),
34       pci_bus_id_(kUndefinedString),
35       name_(kUndefinedString),
36       thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
37                         kUninitializedUint64),
38       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
39                        kUninitializedUint64),
40       threads_per_core_limit_(kUninitializedUint64),
41       threads_per_block_limit_(kUninitializedUint64),
42       threads_per_warp_(kUninitializedUint64),
43       registers_per_core_limit_(kUninitializedUint64),
44       registers_per_block_limit_(kUninitializedUint64),
45       device_address_bits_(kUninitializedUint64),
46       device_memory_size_(kUninitializedUint64),
47       memory_bandwidth_(kUninitializedUint64),
48       shared_memory_per_core_(kUninitializedUint64),
49       shared_memory_per_block_(kUninitializedUint64),
50       clock_rate_ghz_(-1.0),
51       rocm_amdgpu_isa_version_(-1),
52       rocm_amdgpu_gcn_arch_name_(kUndefinedString),
53       numa_node_(-1),
54       core_count_(-1),
55       ecc_enabled_(false) {}
56 
ToMap() const57 std::unique_ptr<std::map<std::string, std::string>> DeviceDescription::ToMap()
58     const {
59   std::unique_ptr<std::map<std::string, std::string>> owned_result{
60       new std::map<std::string, std::string>};
61   std::map<std::string, std::string> &result = *owned_result;
62   result["Device Vendor"] = device_vendor();
63   result["Platform Version"] = platform_version();
64   result["Driver Version"] = driver_version();
65   result["Runtime Version"] = runtime_version();
66   result["PCI bus ID"] = pci_bus_id_;
67   result["Device Name"] = name_;
68 
69   const ThreadDim &thread_dim = thread_dim_limit();
70   result["ThreadDim Limit"] =
71       absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
72   const BlockDim &block_dim = block_dim_limit();
73   result["BlockDim Limit"] =
74       absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
75 
76   result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
77   result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
78   result["Registers Per Block Limit"] =
79       absl::StrCat(registers_per_block_limit());
80 
81   result["Device Address Bits"] = absl::StrCat(device_address_bits());
82   result["Device Memory Size"] =
83       port::HumanReadableNumBytes::ToString(device_memory_size());
84   result["Memory Bandwidth"] = absl::StrCat(
85       port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
86 
87   result["Shared Memory Per Core"] =
88       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
89   result["Shared Memory Per Block"] =
90       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
91 
92   result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
93 
94   result["CUDA Compute Capability"] = cuda_compute_capability().ToString();
95 
96   result["AMDGPU GCN Arch Name"] = rocm_amdgpu_gcn_arch_name_;
97 
98   result["NUMA Node"] = absl::StrCat(numa_node());
99   result["Core Count"] = absl::StrCat(core_count());
100   result["ECC Enabled"] = absl::StrCat(ecc_enabled());
101   return owned_result;
102 }
103 
104 namespace internal {
105 
DeviceDescriptionBuilder()106 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
107     : device_description_(new DeviceDescription) {}
108 
109 }  // namespace internal
110 
cuda_compute_capability() const111 CudaComputeCapability DeviceDescription::cuda_compute_capability() const {
112   return cuda_compute_capability_;
113 }
114 
rocm_amdgpu_isa_version(int * version) const115 bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
116   bool status = false;
117   if (rocm_amdgpu_isa_version_ > 0) {
118     *version = rocm_amdgpu_isa_version_;
119     status = true;
120   }
121   return status;
122 }
123 
ThreadDimOk(const DeviceDescription & device_description,const ThreadDim & thread_dim)124 bool ThreadDimOk(const DeviceDescription &device_description,
125                  const ThreadDim &thread_dim) {
126   const int64_t total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
127   const int64_t threads_per_block_limit =
128       device_description.threads_per_block_limit();
129   if (total_threads > threads_per_block_limit) {
130     VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
131             << " vs limit " << threads_per_block_limit;
132     return false;
133   }
134 
135   const auto &limit = device_description.thread_dim_limit();
136   bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
137             thread_dim.z <= limit.z;
138   if (!ok) {
139     VLOG(2) << "thread dim " << thread_dim.ToString()
140             << " exceeds limit constraints of " << limit.ToString();
141   }
142   return ok;
143 }
144 
DivideCeil(uint64 x,uint64 y)145 uint64 DivideCeil(uint64 x, uint64 y) {
146   return port::MathUtil::CeilOfRatio(x, y);
147 }
148 
CalculateDimensionality(const DeviceDescription & device_description,int64_t element_count,int64 * threads_per_block,int64 * block_count)149 void CalculateDimensionality(const DeviceDescription &device_description,
150                              int64_t element_count, int64 *threads_per_block,
151                              int64 *block_count) {
152   *threads_per_block = device_description.threads_per_block_limit();
153   *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
154   if (*block_count == 1) {
155     CHECK_LE(element_count, *threads_per_block);
156     *threads_per_block = element_count;
157   }
158 }
159 
160 }  // namespace stream_executor
161