• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/device_description.h"
17 
18 #include <algorithm>
19 
20 #include "tensorflow/stream_executor/lib/human_readable.h"
21 #include "tensorflow/stream_executor/lib/mathutil.h"
22 #include "tensorflow/stream_executor/lib/strcat.h"
23 
24 namespace perftools {
25 namespace gputools {
26 
27 static const uint64 kUninitializedUint64 = -1ULL;
28 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
29 
DeviceDescription()30 DeviceDescription::DeviceDescription()
31     : device_vendor_(kUndefinedString),
32       platform_version_(kUndefinedString),
33       driver_version_(kUndefinedString),
34       runtime_version_(kUndefinedString),
35       pci_bus_id_(kUndefinedString),
36       name_(kUndefinedString),
37       thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
38                         kUninitializedUint64),
39       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
40                        kUninitializedUint64),
41       blocks_per_core_limit_(kUninitializedUint64),
42       threads_per_core_limit_(kUninitializedUint64),
43       threads_per_block_limit_(kUninitializedUint64),
44       threads_per_warp_(kUninitializedUint64),
45       registers_per_core_limit_(kUninitializedUint64),
46       registers_per_block_limit_(kUninitializedUint64),
47       registers_per_thread_limit_(kUninitializedUint64),
48       warp_alloc_granularity_(1),
49       register_alloc_granularity_(1),
50       shared_memory_alloc_granularity_(1),
51       device_address_bits_(kUninitializedUint64),
52       device_memory_size_(kUninitializedUint64),
53       shared_memory_per_core_(kUninitializedUint64),
54       shared_memory_per_block_(kUninitializedUint64),
55       clock_rate_ghz_(-1.0),
56       cuda_compute_capability_major_(-1),
57       cuda_compute_capability_minor_(-1),
58       numa_node_(-1),
59       core_count_(-1),
60       ecc_enabled_(false) {}
61 
ToMap() const62 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
63   std::unique_ptr<std::map<string, string>> owned_result{
64       new std::map<string, string>};
65   std::map<string, string> &result = *owned_result;
66   result["Device Vendor"] = device_vendor();
67   result["Platform Version"] = platform_version();
68   result["Driver Version"] = driver_version();
69   result["Runtime Version"] = runtime_version();
70   result["PCI bus ID"] = pci_bus_id_;
71   result["Device Name"] = name_;
72 
73   const ThreadDim &thread_dim = thread_dim_limit();
74   result["ThreadDim Limit"] =
75       port::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
76   const BlockDim &block_dim = block_dim_limit();
77   result["BlockDim Limit"] =
78       port::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
79 
80   result["Threads Per Core Limit"] = port::StrCat(threads_per_core_limit());
81   result["Threads Per Block Limit"] = port::StrCat(threads_per_block_limit());
82   result["Registers Per Block Limit"] =
83       port::StrCat(registers_per_block_limit());
84 
85   result["Device Address Bits"] = port::StrCat(device_address_bits());
86   result["Device Memory Size"] =
87       port::HumanReadableNumBytes::ToString(device_memory_size());
88 
89   result["Shared Memory Per Core"] =
90       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
91   result["Shared Memory Per Block"] =
92       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
93 
94   result["Clock Rate GHz"] = port::StrCat(clock_rate_ghz());
95 
96   result["CUDA Compute Capability"] = port::StrCat(
97       cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
98 
99   result["NUMA Node"] = port::StrCat(numa_node());
100   result["Core Count"] = port::StrCat(core_count());
101   result["ECC Enabled"] = port::StrCat(ecc_enabled());
102   return owned_result;
103 }
104 
105 namespace internal {
106 
DeviceDescriptionBuilder()107 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
108     : device_description_(new DeviceDescription) {}
109 
110 }  // namespace internal
111 
cuda_compute_capability(int * major,int * minor) const112 bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
113   *major = cuda_compute_capability_major_;
114   *minor = cuda_compute_capability_minor_;
115   return cuda_compute_capability_major_ != 0;
116 }
117 
ThreadDimOk(const DeviceDescription & device_description,const ThreadDim & thread_dim)118 bool ThreadDimOk(const DeviceDescription &device_description,
119                  const ThreadDim &thread_dim) {
120   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
121   auto threads_per_block_limit = device_description.threads_per_block_limit();
122   if (total_threads > threads_per_block_limit) {
123     VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
124             << " vs limit " << threads_per_block_limit;
125     return false;
126   }
127 
128   const auto &limit = device_description.thread_dim_limit();
129   bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
130             thread_dim.z <= limit.z;
131   if (!ok) {
132     VLOG(2) << "thread dim " << thread_dim.ToString()
133             << " exceeds limit contraints of " << limit.ToString();
134   }
135   return ok;
136 }
137 
DivideCeil(uint64 x,uint64 y)138 uint64 DivideCeil(uint64 x, uint64 y) {
139   return port::MathUtil::CeilOfRatio(x, y);
140 }
141 
CalculateDimensionality(const DeviceDescription & device_description,uint64 element_count,uint64 * threads_per_block,uint64 * block_count)142 void CalculateDimensionality(const DeviceDescription &device_description,
143                              uint64 element_count, uint64 *threads_per_block,
144                              uint64 *block_count) {
145   *threads_per_block = device_description.threads_per_block_limit();
146   *block_count = DivideCeil(element_count, *threads_per_block);
147   if (*block_count == 1) {
148     CHECK_LE(element_count, *threads_per_block);
149     *threads_per_block = element_count;
150   }
151 }
152 
153 // Round value up to a multiple of n.
RoundUp(uint64 value,uint64 n)154 static uint64 RoundUp(uint64 value, uint64 n) {
155   return port::MathUtil::CeilOfRatio(value, n) * n;
156 }
157 
158 // Round value down to a multiple of n.
RoundDown(uint64 value,uint64 n)159 static uint64 RoundDown(uint64 value, uint64 n) {
160   return port::MathUtil::FloorOfRatio(value, n) * n;
161 }
162 
CalculateOccupancy(const DeviceDescription & device_description,uint64 registers_per_thread,uint64 shared_memory_per_block,const ThreadDim & thread_dims)163 uint64 CalculateOccupancy(const DeviceDescription &device_description,
164                           uint64 registers_per_thread,
165                           uint64 shared_memory_per_block,
166                           const ThreadDim &thread_dims) {
167   // Don't try to compute occupancy if necessary values are not initialized.
168   uint64 required_fields[] =  { device_description.registers_per_thread_limit(),
169                                 device_description.threads_per_warp(),
170                                 device_description.warp_alloc_granularity(),
171                                 device_description.register_alloc_granularity(),
172                                 device_description.registers_per_block_limit(),
173                                 device_description.shared_memory_per_core(),
174                                 device_description.blocks_per_core_limit() };
175   for (auto value : required_fields) {
176     if (value == kUninitializedUint64) {
177       return 0;
178     }
179   }
180 
181   if (registers_per_thread > device_description.registers_per_thread_limit()) {
182     return 0;
183   }
184 
185   uint64 warps_per_block =
186       port::MathUtil::CeilOfRatio(thread_dims.x * thread_dims.y * thread_dims.z,
187                                   device_description.threads_per_warp());
188 
189   // Warp resources are allocated at a particular granularity.  This value is
190   // the effective number of warps for resource allocation purposes.
191   uint64 alloc_warps_per_block =
192       RoundUp(warps_per_block, device_description.warp_alloc_granularity());
193 
194   uint64 alloc_regs_per_warp =
195       RoundUp(device_description.threads_per_warp() * registers_per_thread,
196               device_description.register_alloc_granularity());
197   uint64 regs_per_block = alloc_warps_per_block * alloc_regs_per_warp;
198   uint64 reg_limit =
199       device_description.registers_per_block_limit() / regs_per_block;
200 
201   uint64 alloc_smem_per_block = RoundUp(
202       shared_memory_per_block,
203       device_description.shared_memory_alloc_granularity());
204   uint64 smem_limit = alloc_smem_per_block > 0 ?
205       device_description.shared_memory_per_core() / alloc_smem_per_block :
206       device_description.blocks_per_core_limit();
207 
208   uint64 thread_limit = device_description.threads_per_core_limit()
209       / (warps_per_block  * device_description.threads_per_warp());
210 
211   return std::min({ device_description.blocks_per_core_limit(),
212           reg_limit, smem_limit, thread_limit });
213 }
214 
CalculateRegisterLimitForTargetOccupancy(const DeviceDescription & device_description,uint64 shared_memory_per_block,const ThreadDim & thread_dims,uint64 target_blocks_per_core)215 uint64 CalculateRegisterLimitForTargetOccupancy(
216     const DeviceDescription &device_description, uint64 shared_memory_per_block,
217     const ThreadDim &thread_dims, uint64 target_blocks_per_core) {
218   // Linear search from maximum number of registers down until the target
219   // blocks per SM is found.
220   // TODO(meheff): Compute this using a closed form solution.
221   int reg_step = device_description.register_alloc_granularity() /
222       device_description.threads_per_warp();
223   for (int r = device_description.registers_per_thread_limit(); r > 0;
224        r = RoundDown(r - 1, reg_step)) {
225     uint64 occupancy = CalculateOccupancy(
226         device_description, r, shared_memory_per_block, thread_dims);
227     if (occupancy >= target_blocks_per_core) {
228       return r;
229     }
230   }
231   return 0;
232 }
233 
234 
235 }  // namespace gputools
236 }  // namespace perftools
237