/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device,
                        ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
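// Illustrative examples (device strings are lowercased by AddStepStat before
// these checks are applied):
//   "/job:0/replica:0/task:0/device:gpu:0"            -> CPU time, canonical
//   "/job:0/replica:0/task:0/device:gpu:0/stream:all" -> accelerator time
//   "/job:0/replica:0/task:0/device:cpu:0"            -> CPU time, canonical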

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. Both are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// includes some post-processing. Also, there is currently no way to measure
// the execution time of async ops accurately.
//
// Notes about device:
// For ops on gpu:
// An op appears under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is a combined view of all
// the different 3) streams. 1) is the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on cpu:
// An op only appears as cpu:0.

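// AddTimeStats records one NodeExecStats timing entry for this step. As a
// rough sketch of the bookkeeping: a kernel on "/device:gpu:0/stream:all"
// with all_start_micros=1000 and op_end_rel_micros=5 appends the pair
// (1000, 5) to accelerator_execs_ and op_execs_ for that device, while the
// same op's entry on the canonical "/device:gpu:0" is appended to cpu_execs_
// and op_execs_ and bumps run_count.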
void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64>(exec_.all_start_micros()),
                   static_cast<int64>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round a quick execution up to 1 microsecond to be semantically robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64, int64> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}

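// AddMemoryStats records the per-step memory profile of this node: allocator
// bytes-in-use and allocation records from NodeExecStats.memory, per-slot
// output bytes from the output tensor descriptions, temp/persistent bytes
// from NodeExecStats.memory_stats, and the derived requested/residual/peak
// byte totals.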
void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    fprintf(stderr, "%s has no start time, skipping\n",
            step_stat.node_name().c_str());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    fprintf(stderr, "found %d gpu allocators for 1 node\n",
            accelerator_allocator_cnt);
  }

  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on gpu only allocate memory on gpu.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: Memory tracking is suspicious and requires a large-scale
  // cleanup.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
  //    in 'NodeExecStats.memory'. However, there is no way to count it as
  //    persistent now.
  // 5. record_xxx_persistent is called when allocate_persistent
  //    is not used and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It's suspicious, but we use it for now
  //    since it covers constant ops.
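  // In summary (under the caveats above): requested_bytes adds the allocator
  // totals plus temp and persistent bytes, residual_bytes adds the bytes
  // still live at the end of the step plus persistent bytes, and peak_bytes
  // adds the allocator peaks plus temp and persistent bytes.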
  int64 residual_bytes = 0;
  int64 requested_bytes = 0;
  int64 peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}

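// AddStepStat folds one NodeExecStats entry into the per-step ExecStep for
// this node: it lowercases the device name, remembers the first canonical
// device (and a derived host device) it sees, adds timing stats for every
// device the node appears on, and adds memory stats only for the canonical
// device.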
void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = str_util::Lowercase(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      if (dev.find("sycl") != dev.npos) {
        node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
      } else {
        node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      }
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

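// Total op time for a step: accelerator kernel time (the stream:all entries)
// plus the time recorded under the canonical device (scheduling and pre/post
// processing for GPU ops, kernel time for CPU ops).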
int64 ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64 ExecStep::accelerator_exec_micros() const {
  int64 total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or be scheduled multiple times
    // in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64 ExecStep::cpu_exec_micros() const {
  int64 total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

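// Shape conversions between TensorShapeProto and a flat vector of dims.
// For example, a proto with dims [2, 3] converts to {2, 3}; a known-rank
// scalar (no dims, unknown_rank == false) converts to {1}; and an empty
// vector converts back to a proto with unknown_rank set.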
std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64 s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}

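// Placement checks on the (lowercased) device string. For example,
// "/job:0/replica:0/task:0/device:gpu:0" counts as an accelerator placement,
// and "/job:0/replica:0/task:0/device:cpu:0" counts as a CPU placement.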
bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos ||
         device.find("sycl") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow