/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/device_description.h"

#include <algorithm>

#include "tensorflow/stream_executor/lib/human_readable.h"
#include "tensorflow/stream_executor/lib/mathutil.h"
#include "tensorflow/stream_executor/lib/strcat.h"

namespace perftools {
namespace gputools {

// Sentinel for fields that a platform back-end has not yet populated;
// -1ULL is the maximum uint64 value.
static const uint64 kUninitializedUint64 = -1ULL;
/* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";

DeviceDescription::DeviceDescription()
    : device_vendor_(kUndefinedString),
      platform_version_(kUndefinedString),
      driver_version_(kUndefinedString),
      runtime_version_(kUndefinedString),
      pci_bus_id_(kUndefinedString),
      name_(kUndefinedString),
      thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                        kUninitializedUint64),
      block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
                       kUninitializedUint64),
      blocks_per_core_limit_(kUninitializedUint64),
      threads_per_core_limit_(kUninitializedUint64),
      threads_per_block_limit_(kUninitializedUint64),
      threads_per_warp_(kUninitializedUint64),
      registers_per_core_limit_(kUninitializedUint64),
      registers_per_block_limit_(kUninitializedUint64),
      registers_per_thread_limit_(kUninitializedUint64),
      warp_alloc_granularity_(1),
      register_alloc_granularity_(1),
      shared_memory_alloc_granularity_(1),
      device_address_bits_(kUninitializedUint64),
      device_memory_size_(kUninitializedUint64),
      shared_memory_per_core_(kUninitializedUint64),
      shared_memory_per_block_(kUninitializedUint64),
      clock_rate_ghz_(-1.0),
      cuda_compute_capability_major_(-1),
      cuda_compute_capability_minor_(-1),
      numa_node_(-1),
      core_count_(-1),
      ecc_enabled_(false) {}

std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
  std::unique_ptr<std::map<string, string>> owned_result{
      new std::map<string, string>};
  std::map<string, string> &result = *owned_result;
  result["Device Vendor"] = device_vendor();
  result["Platform Version"] = platform_version();
  result["Driver Version"] = driver_version();
  result["Runtime Version"] = runtime_version();
  result["PCI bus ID"] = pci_bus_id_;
  result["Device Name"] = name_;

  const ThreadDim &thread_dim = thread_dim_limit();
  result["ThreadDim Limit"] =
      port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
  const BlockDim &block_dim = block_dim_limit();
  result["BlockDim Limit"] =
      port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);

  result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit());
  result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit());
  result["Registers Per Block Limit"] =
      port::StrCat(registers_per_block_limit());

  result["Device Address Bits"] = port::StrCat(device_address_bits());
  result["Device Memory Size"] =
      port::HumanReadableNumBytes::ToString(device_memory_size());

  result["Shared Memory Per Core"] =
      port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
  result["Shared Memory Per Block"] =
      port::HumanReadableNumBytes::ToString(shared_memory_per_block_);

  result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz());

  result["CUDA Compute Capability"] = port::StrCat(
      cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);

  result["NUMA Node"] = port::StrCat(numa_node());
  result["Core Count"] = port::StrCat(core_count());
  result["ECC Enabled"] = port::StrCat(ecc_enabled());
  return owned_result;
}
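
// For illustration, a minimal sketch of how the map above might be consumed
// (assumes a populated DeviceDescription named `desc` and the usual logging
// macros; not part of this file's API):
//
//   for (const auto &entry : *desc.ToMap()) {
//     LOG(INFO) << entry.first << ": " << entry.second;
//   }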

namespace internal {

DeviceDescriptionBuilder::DeviceDescriptionBuilder()
    : device_description_(new DeviceDescription) {}

}  // namespace internal

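// A hedged sketch of how a platform back-end might use the builder; the
// setter names mirror the DeviceDescription accessors above, but treat them
// as assumptions about the builder interface rather than a spec:
//
//   internal::DeviceDescriptionBuilder builder;
//   builder.set_device_vendor("NVIDIA Corporation");
//   builder.set_name("GK104");
//   std::unique_ptr<DeviceDescription> description = builder.Build();
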
// Writes the CUDA compute capability into *major and *minor; returns false
// only when the major version is zero (i.e. no capability was recorded).
bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
  *major = cuda_compute_capability_major_;
  *minor = cuda_compute_capability_minor_;
  return cuda_compute_capability_major_ != 0;
}
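
// Example of a capability gate built on this call (a sketch; SM 3.5 is
// chosen arbitrarily):
//
//   int major, minor;
//   if (desc.cuda_compute_capability(&major, &minor) &&
//       (major > 3 || (major == 3 && minor >= 5))) {
//     // Safe to use features that require compute capability 3.5+.
//   }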

bool ThreadDimOk(const DeviceDescription &device_description,
                 const ThreadDim &thread_dim) {
  auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
  auto threads_per_block_limit = device_description.threads_per_block_limit();
  if (total_threads > threads_per_block_limit) {
    VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
            << " vs limit " << threads_per_block_limit;
    return false;
  }

  const auto &limit = device_description.thread_dim_limit();
  bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
            thread_dim.z <= limit.z;
  if (!ok) {
    VLOG(2) << "thread dim " << thread_dim.ToString()
            << " exceeds limit constraints of " << limit.ToString();
  }
  return ok;
}
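
// Usage sketch: validating launch dimensions before enqueueing a kernel
// (ThreadDim's three-argument constructor is assumed from launch_dim.h):
//
//   if (!ThreadDimOk(device_description, ThreadDim(32, 32, 1))) {
//     // 1024 threads per block; rejected on devices with a lower per-block
//     // limit or a per-dimension limit below 32.
//   }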

// Returns the ceiling of x / y, e.g. DivideCeil(10, 3) == 4.
uint64 DivideCeil(uint64 x, uint64 y) {
  return port::MathUtil::CeilOfRatio(x, y);
}

// Calculates a thread/block launch decomposition that covers element_count
// elements: blocks are filled to the device's per-block thread limit, and a
// single partially-full block shrinks to exactly element_count threads.
void CalculateDimensionality(const DeviceDescription &device_description,
                             uint64 element_count, uint64 *threads_per_block,
                             uint64 *block_count) {
  *threads_per_block = device_description.threads_per_block_limit();
  *block_count = DivideCeil(element_count, *threads_per_block);
  if (*block_count == 1) {
    CHECK_LE(element_count, *threads_per_block);
    *threads_per_block = element_count;
  }
}
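
// Worked example: on a device whose threads_per_block_limit() is 1024,
// element_count = 1000000 yields *threads_per_block = 1024 and
// *block_count = DivideCeil(1000000, 1024) = 977; the final block covers
// the 576-element remainder (kernels are expected to bounds-check).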

// Round value up to a multiple of n, e.g. RoundUp(10, 4) == 12.
static uint64 RoundUp(uint64 value, uint64 n) {
  return port::MathUtil::CeilOfRatio(value, n) * n;
}

// Round value down to a multiple of n, e.g. RoundDown(10, 4) == 8.
static uint64 RoundDown(uint64 value, uint64 n) {
  return port::MathUtil::FloorOfRatio(value, n) * n;
}

// Returns the number of blocks of the given shape that can be resident on a
// core at once: the minimum of the per-core block, register, shared-memory,
// and thread limits. Returns 0 if any required device field is uninitialized
// or the per-thread register request exceeds the device limit.
uint64 CalculateOccupancy(const DeviceDescription &device_description,
                          uint64 registers_per_thread,
                          uint64 shared_memory_per_block,
                          const ThreadDim &thread_dims) {
  // Don't try to compute occupancy if necessary values are not initialized.
  uint64 required_fields[] = { device_description.registers_per_thread_limit(),
                               device_description.threads_per_warp(),
                               device_description.warp_alloc_granularity(),
                               device_description.register_alloc_granularity(),
                               device_description.registers_per_block_limit(),
                               device_description.shared_memory_per_core(),
                               device_description.blocks_per_core_limit() };
  for (auto value : required_fields) {
    if (value == kUninitializedUint64) {
      return 0;
    }
  }

  if (registers_per_thread > device_description.registers_per_thread_limit()) {
    return 0;
  }

  uint64 warps_per_block =
      port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
                                  device_description.threads_per_warp());

  // Warp resources are allocated at a particular granularity. This value is
  // the effective number of warps for resource allocation purposes.
  uint64 alloc_warps_per_block =
      RoundUp(warps_per_block, device_description.warp_alloc_granularity());

  uint64 alloc_regs_per_warp =
      RoundUp(device_description.threads_per_warp() * registers_per_thread,
              device_description.register_alloc_granularity());
  uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp;
  uint64 reg_limit =
      device_description.registers_per_block_limit() / regs_per_block;

  uint64 alloc_smem_per_block = RoundUp(
      shared_memory_per_block,
      device_description.shared_memory_alloc_granularity());
  uint64 smem_limit = alloc_smem_per_block > 0 ?
      device_description.shared_memory_per_core() / alloc_smem_per_block :
      device_description.blocks_per_core_limit();

  uint64 thread_limit = device_description.threads_per_core_limit()
      / (warps_per_block * device_description.threads_per_warp());

  return std::min({ device_description.blocks_per_core_limit(),
                    reg_limit, smem_limit, thread_limit });
}
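
// Worked example, using hypothetical device values (threads_per_warp = 32,
// warp_alloc_granularity = 2, register_alloc_granularity = 256,
// registers_per_block_limit = 65536, shared_memory_per_core = 49152,
// shared_memory_alloc_granularity = 256, threads_per_core_limit = 2048,
// blocks_per_core_limit = 16) and a 128x1x1 block that uses 40 registers
// per thread and 6000 bytes of shared memory:
//
//   warps_per_block       = CeilOfRatio(128, 32)   = 4
//   alloc_warps_per_block = RoundUp(4, 2)          = 4
//   alloc_regs_per_warp   = RoundUp(32 * 40, 256)  = 1280
//   reg_limit             = 65536 / (4 * 1280)     = 12
//   alloc_smem_per_block  = RoundUp(6000, 256)     = 6144
//   smem_limit            = 49152 / 6144           = 8
//   thread_limit          = 2048 / (4 * 32)        = 16
//
// so the occupancy is min(16, 12, 8, 16) = 8 blocks per core, with shared
// memory as the binding constraint.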

// Searches for the largest per-thread register count whose occupancy, as
// computed above, meets target_blocks_per_core; returns 0 if no such count
// exists.
uint64 CalculateRegisterLimitForTargetOccupancy(
    const DeviceDescription &device_description, uint64 shared_memory_per_block,
    const ThreadDim &thread_dims, uint64 target_blocks_per_core) {
  // Linear search from maximum number of registers down until the target
  // blocks per SM is found.
  // TODO(meheff): Compute this using a closed form solution.
  int reg_step = device_description.register_alloc_granularity() /
      device_description.threads_per_warp();
  for (int r = device_description.registers_per_thread_limit(); r > 0;
       r = RoundDown(r - 1, reg_step)) {
    uint64 occupancy = CalculateOccupancy(
        device_description, r, shared_memory_per_block, thread_dims);
    if (occupancy >= target_blocks_per_core) {
      return r;
    }
  }
  return 0;
}
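
// Continuing the hypothetical device above with shared_memory_per_block = 0,
// a 128x1x1 block, registers_per_thread_limit = 255, and a target of 12
// blocks per core: reg_step = 256 / 32 = 8, and the search descends through
// 255, 248, 240, ... until r = 40, where reg_limit reaches 12 and the
// function returns 40.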

}  // namespace gputools
}  // namespace perftools