1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/device_description.h"
17
18 #include <algorithm>
19
20 #include "absl/strings/str_cat.h"
21 #include "tensorflow/stream_executor/lib/human_readable.h"
22 #include "tensorflow/stream_executor/lib/mathutil.h"
23
24 namespace stream_executor {
25
// Sentinel (all bits set) that the default DeviceDescription constructor
// stores in the dimension, limit and size fields until real values are set.
static const uint64 kUninitializedUint64 = -1ULL;
// Placeholder text that the default DeviceDescription constructor stores in
// the string fields until real values are set.
/* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
28
// Builds a description in which every field holds an "unset" placeholder:
// kUndefinedString for the string fields, kUninitializedUint64 for the
// dimension/limit/size fields, -1 for the clock rate, CUDA compute capability,
// ROCm ISA version, NUMA node and core count, and false for ECC.
DeviceDescription::DeviceDescription()
    // String fields.
    : device_vendor_(kUndefinedString),
      platform_version_(kUndefinedString),
      driver_version_(kUndefinedString),
      runtime_version_(kUndefinedString),
      pci_bus_id_(kUndefinedString),
      name_(kUndefinedString),
      // Per-dimension (x, y, z) launch limits.
      thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
      block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                       kUninitializedUint64),
      // Scalar thread/register limits.
      threads_per_core_limit_(kUninitializedUint64),
      threads_per_block_limit_(kUninitializedUint64),
      threads_per_warp_(kUninitializedUint64),
      registers_per_core_limit_(kUninitializedUint64),
      registers_per_block_limit_(kUninitializedUint64),
      // Memory-related fields.
      device_address_bits_(kUninitializedUint64),
      device_memory_size_(kUninitializedUint64),
      memory_bandwidth_(kUninitializedUint64),
      shared_memory_per_core_(kUninitializedUint64),
      shared_memory_per_block_(kUninitializedUint64),
      // Remaining fields use -1 (or false) as their "unset" value.
      clock_rate_ghz_(-1.0),
      cuda_compute_capability_major_(-1),
      cuda_compute_capability_minor_(-1),
      rocm_amdgpu_isa_version_(-1),
      numa_node_(-1),
      core_count_(-1),
      ecc_enabled_(false) {}
57
ToMap() const58 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
59 std::unique_ptr<std::map<string, string>> owned_result{
60 new std::map<string, string>};
61 std::map<string, string> &result = *owned_result;
62 result["Device Vendor"] = device_vendor();
63 result["Platform Version"] = platform_version();
64 result["Driver Version"] = driver_version();
65 result["Runtime Version"] = runtime_version();
66 result["PCI bus ID"] = pci_bus_id_;
67 result["Device Name"] = name_;
68
69 const ThreadDim &thread_dim = thread_dim_limit();
70 result["ThreadDim Limit"] =
71 absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
72 const BlockDim &block_dim = block_dim_limit();
73 result["BlockDim Limit"] =
74 absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
75
76 result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
77 result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
78 result["Registers Per Block Limit"] =
79 absl::StrCat(registers_per_block_limit());
80
81 result["Device Address Bits"] = absl::StrCat(device_address_bits());
82 result["Device Memory Size"] =
83 port::HumanReadableNumBytes::ToString(device_memory_size());
84 result["Memory Bandwidth"] = absl::StrCat(
85 port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
86
87 result["Shared Memory Per Core"] =
88 port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
89 result["Shared Memory Per Block"] =
90 port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
91
92 result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
93
94 result["CUDA Compute Capability"] = absl::StrCat(
95 cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
96
97 result["NUMA Node"] = absl::StrCat(numa_node());
98 result["Core Count"] = absl::StrCat(core_count());
99 result["ECC Enabled"] = absl::StrCat(ecc_enabled());
100 return owned_result;
101 }
102
103 namespace internal {
104
DeviceDescriptionBuilder()105 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
106 : device_description_(new DeviceDescription) {}
107
108 } // namespace internal
109
cuda_compute_capability(int * major,int * minor) const110 bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
111 *major = cuda_compute_capability_major_;
112 *minor = cuda_compute_capability_minor_;
113 return cuda_compute_capability_major_ != 0;
114 }
115
rocm_amdgpu_isa_version(int * version) const116 bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
117 bool status = false;
118 if (rocm_amdgpu_isa_version_ > 0) {
119 *version = rocm_amdgpu_isa_version_;
120 status = true;
121 }
122 return status;
123 }
124
ThreadDimOk(const DeviceDescription & device_description,const ThreadDim & thread_dim)125 bool ThreadDimOk(const DeviceDescription &device_description,
126 const ThreadDim &thread_dim) {
127 auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
128 auto threads_per_block_limit = device_description.threads_per_block_limit();
129 if (total_threads > threads_per_block_limit) {
130 VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
131 << " vs limit " << threads_per_block_limit;
132 return false;
133 }
134
135 const auto &limit = device_description.thread_dim_limit();
136 bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
137 thread_dim.z <= limit.z;
138 if (!ok) {
139 VLOG(2) << "thread dim " << thread_dim.ToString()
140 << " exceeds limit constraints of " << limit.ToString();
141 }
142 return ok;
143 }
144
DivideCeil(uint64 x,uint64 y)145 uint64 DivideCeil(uint64 x, uint64 y) {
146 return port::MathUtil::CeilOfRatio(x, y);
147 }
148
CalculateDimensionality(const DeviceDescription & device_description,int64 element_count,int64 * threads_per_block,int64 * block_count)149 void CalculateDimensionality(const DeviceDescription &device_description,
150 int64 element_count, int64 *threads_per_block,
151 int64 *block_count) {
152 *threads_per_block = device_description.threads_per_block_limit();
153 *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
154 if (*block_count == 1) {
155 CHECK_LE(element_count, *threads_per_block);
156 *threads_per_block = element_count;
157 }
158 }
159
160 } // namespace stream_executor
161