1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/device_description.h"
17
18 #include <algorithm>
19
20 #include "absl/strings/str_cat.h"
21 #include "tensorflow/stream_executor/lib/human_readable.h"
22 #include "tensorflow/stream_executor/lib/mathutil.h"
23
24 namespace stream_executor {
25
26 static const uint64 kUninitializedUint64 = -1ULL;
27 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
28
DeviceDescription()29 DeviceDescription::DeviceDescription()
30 : device_vendor_(kUndefinedString),
31 platform_version_(kUndefinedString),
32 driver_version_(kUndefinedString),
33 runtime_version_(kUndefinedString),
34 pci_bus_id_(kUndefinedString),
35 name_(kUndefinedString),
36 thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
37 kUninitializedUint64),
38 block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
39 kUninitializedUint64),
40 threads_per_core_limit_(kUninitializedUint64),
41 threads_per_block_limit_(kUninitializedUint64),
42 threads_per_warp_(kUninitializedUint64),
43 registers_per_core_limit_(kUninitializedUint64),
44 registers_per_block_limit_(kUninitializedUint64),
45 device_address_bits_(kUninitializedUint64),
46 device_memory_size_(kUninitializedUint64),
47 memory_bandwidth_(kUninitializedUint64),
48 shared_memory_per_core_(kUninitializedUint64),
49 shared_memory_per_block_(kUninitializedUint64),
50 clock_rate_ghz_(-1.0),
51 rocm_amdgpu_isa_version_(-1),
52 rocm_amdgpu_gcn_arch_name_(kUndefinedString),
53 numa_node_(-1),
54 core_count_(-1),
55 ecc_enabled_(false) {}
56
ToMap() const57 std::unique_ptr<std::map<std::string, std::string>> DeviceDescription::ToMap()
58 const {
59 std::unique_ptr<std::map<std::string, std::string>> owned_result{
60 new std::map<std::string, std::string>};
61 std::map<std::string, std::string> &result = *owned_result;
62 result["Device Vendor"] = device_vendor();
63 result["Platform Version"] = platform_version();
64 result["Driver Version"] = driver_version();
65 result["Runtime Version"] = runtime_version();
66 result["PCI bus ID"] = pci_bus_id_;
67 result["Device Name"] = name_;
68
69 const ThreadDim &thread_dim = thread_dim_limit();
70 result["ThreadDim Limit"] =
71 absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
72 const BlockDim &block_dim = block_dim_limit();
73 result["BlockDim Limit"] =
74 absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
75
76 result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
77 result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
78 result["Registers Per Block Limit"] =
79 absl::StrCat(registers_per_block_limit());
80
81 result["Device Address Bits"] = absl::StrCat(device_address_bits());
82 result["Device Memory Size"] =
83 port::HumanReadableNumBytes::ToString(device_memory_size());
84 result["Memory Bandwidth"] = absl::StrCat(
85 port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
86
87 result["Shared Memory Per Core"] =
88 port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
89 result["Shared Memory Per Block"] =
90 port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
91
92 result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
93
94 result["CUDA Compute Capability"] = cuda_compute_capability().ToString();
95
96 result["AMDGPU GCN Arch Name"] = rocm_amdgpu_gcn_arch_name_;
97
98 result["NUMA Node"] = absl::StrCat(numa_node());
99 result["Core Count"] = absl::StrCat(core_count());
100 result["ECC Enabled"] = absl::StrCat(ecc_enabled());
101 return owned_result;
102 }
103
104 namespace internal {
105
DeviceDescriptionBuilder()106 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
107 : device_description_(new DeviceDescription) {}
108
109 } // namespace internal
110
cuda_compute_capability() const111 CudaComputeCapability DeviceDescription::cuda_compute_capability() const {
112 return cuda_compute_capability_;
113 }
114
rocm_amdgpu_isa_version(int * version) const115 bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
116 bool status = false;
117 if (rocm_amdgpu_isa_version_ > 0) {
118 *version = rocm_amdgpu_isa_version_;
119 status = true;
120 }
121 return status;
122 }
123
ThreadDimOk(const DeviceDescription & device_description,const ThreadDim & thread_dim)124 bool ThreadDimOk(const DeviceDescription &device_description,
125 const ThreadDim &thread_dim) {
126 const int64_t total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
127 const int64_t threads_per_block_limit =
128 device_description.threads_per_block_limit();
129 if (total_threads > threads_per_block_limit) {
130 VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
131 << " vs limit " << threads_per_block_limit;
132 return false;
133 }
134
135 const auto &limit = device_description.thread_dim_limit();
136 bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
137 thread_dim.z <= limit.z;
138 if (!ok) {
139 VLOG(2) << "thread dim " << thread_dim.ToString()
140 << " exceeds limit constraints of " << limit.ToString();
141 }
142 return ok;
143 }
144
DivideCeil(uint64 x,uint64 y)145 uint64 DivideCeil(uint64 x, uint64 y) {
146 return port::MathUtil::CeilOfRatio(x, y);
147 }
148
CalculateDimensionality(const DeviceDescription & device_description,int64_t element_count,int64 * threads_per_block,int64 * block_count)149 void CalculateDimensionality(const DeviceDescription &device_description,
150 int64_t element_count, int64 *threads_per_block,
151 int64 *block_count) {
152 *threads_per_block = device_description.threads_per_block_limit();
153 *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
154 if (*block_count == 1) {
155 CHECK_LE(element_count, *threads_per_block);
156 *threads_per_block = element_count;
157 }
158 }
159
160 } // namespace stream_executor
161