/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
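// Returns true if 'device' is the per-GPU "stream:all" pseudo-device, whose
// timings aggregate the kernel times of all individual GPU streams.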
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
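// Returns true if 'device' is a plain CPU or GPU device (e.g. .../device:gpu:0
// or .../cpu:0), as opposed to a stream or memcpy sub-device. Time recorded on
// such a device counts as CPU time: op compute time for CPU ops, or
// scheduling/pre/post-processing time for GPU ops.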
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu):\\d+");
}
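// A canonical device is the device the op was placed on (a plain cpu/gpu
// device), as opposed to the stream/memcpy sub-devices that also appear in
// RunMetadata.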
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
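//
// For illustration only (hypothetical, already lower-cased device names, as
// produced by AddStepStat below):
//   "/device:gpu:0/stream:all"             -> CountAsAcceleratorTime() is true
//   "/job:w/replica:0/task:0/device:gpu:0" -> CountAsCPUTime() is true
//   "/device:gpu:0/stream:3"               -> neither; its time is not counted
//                                             as accelerator or CPU time.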

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// also includes some post-processing. In addition, there is currently no way
// to measure the execution time of async ops accurately.
//
// Notes about device:
// For ops on gpu:
// They appear under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used as a combined view
// of all the different 3). 1) is the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on cpu:
// They only appear as cpu:0.

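// Merges the timing of one NodeExecStats entry (one device's view of one op
// execution) into this ExecStep: extends the overall execution window
// (all_start_micros / latest_end_micros) and records the (start, duration)
// pair into the accelerator, CPU, and per-op buckets keyed by device.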
void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64_t>(exec_.all_start_micros()),
                   static_cast<int64_t>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64_t op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round zero-duration executions up to 1 microsecond so they are not
    // treated as if they never ran.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64_t>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64_t, int64_t> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}

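// Records the memory usage of one NodeExecStats entry for this ExecStep:
// allocator bytes in use and allocation records from the GPU allocator,
// per-output tensor bytes, temp/persistent bytes split by host vs.
// accelerator, and the derived requested/residual/peak byte totals.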
void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    absl::FPrintF(stderr, "%s has no start time, skipping\n",
                  step_stat.node_name());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64_t>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64_t>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    absl::FPrintF(stderr, "found %d gpu allocator for 1 node\n",
                  accelerator_allocator_cnt);
  }

  int64_t total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe allocated_bytes.
      int64_t output_bytes = std::max(output.tensor_description()
                                          .allocation_description()
                                          .allocated_bytes(),
                                      output.tensor_description()
                                          .allocation_description()
                                          .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on gpu only allocate memory on gpu.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: memory tracking is suspicious and requires a large-scale
  // cleanup.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  // discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  // 'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  // in 'NodeExecStats.memory_stats'.
  // 4. record_xxx_persistent is called when allocate_persistent
  // is not used, and hence tracks some complementary bytes. It appears in
  // 'NodeExecStats.memory_stats'. It's suspicious, but we should
  // use it for now since it covers constant ops.
  int64_t residual_bytes = 0;
  int64_t requested_bytes = 0;
  int64_t peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}

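// Adds one NodeExecStats entry, recorded on 'device' during profiling step
// 'step', to this graph node: normalizes the device name, remembers the
// node's canonical and host devices the first time a canonical device is
// seen, and forwards the stats to the per-step ExecStep.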
void TFGraphNode::AddStepStat(int64_t step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = absl::AsciiStrToLower(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64_t, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

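// Total op execution time for this step: accelerator (GPU kernel) time plus
// CPU time.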
int64_t ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

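// Total GPU kernel time for this step, summed from the gpu:x/stream:all
// device(s).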
int64_t ExecStep::accelerator_exec_micros() const {
  int64_t total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or be scheduled multiple times in a
    // while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

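// Total CPU time for this step: op compute time for ops placed on CPU, or
// scheduling/pre/post-processing time for ops placed on GPU.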
int64_t ExecStep::cpu_exec_micros() const {
  int64_t total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

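// Converts a TensorShapeProto to a vector of dimension sizes. A known-rank
// shape with no dimensions (a scalar) is mapped to {1}; an unknown-rank shape
// is mapped to an empty vector.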
std::vector<int64_t> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64_t> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

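// Converts a vector of dimension sizes back to a TensorShapeProto. An empty
// vector is mapped to an unknown-rank shape.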
TensorShapeProto VecToShapeProto(const std::vector<int64_t>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64_t s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}

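// Simple substring checks on the (lower-cased) device name, used to decide
// whether an op is placed on an accelerator or on the CPU.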
bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow