/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
int kMaxDisplayedMemNode = 10;

std::string GetTimeDevName(const std::string& dev) {
  if (dev.find("stream") != dev.npos) {
    return absl::StrCat("Op execution threads: ", dev);
  } else {
    return absl::StrCat("Op scheduling threads: ", dev);
  }
}
std::string GetMemoryLaneName(const std::string& dev) {
  return absl::StrCat("mem usage on:", dev);
}
}  // namespace

Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64_t pid,
                                              int64_t tid, int64_t ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}
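
// Example (hypothetical values): CreateEvent("X", "Op", "MatMul", /*pid=*/1,
// /*tid=*/0, /*ts=*/1000) produces the shared trace-event fields, serialized
// roughly as
//   {"ph": "X", "cat": "Op", "name": "MatMul", "pid": 1, "tid": 0, "ts": 1000}
// Callers such as EmitRegion add the event-specific fields ("dur", "args").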

void ChromeTraceFormatter::EmitPID(const string& name, int64_t pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

void ChromeTraceFormatter::EmitRegion(int64_t ts, int64_t duration, int64_t pid,
                                      int64_t tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

void ChromeTraceFormatter::EmitFlowStart(const string& name, int64_t ts,
                                         int64_t pid, int64_t tid,
                                         int64_t flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64_t ts,
                                       int64_t pid, int64_t tid,
                                       int64_t flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}
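
// A flow arrow is encoded as a pair of events that share the same "id":
// EmitFlowStart writes the source end (phase "s") and EmitFlowEnd writes the
// destination end (phase "t"). Example (hypothetical values), for flow_id 7:
//   {"ph": "s", "cat": "DataFlow", "name": "add_flow", "pid": 1, "tid": 0,
//    "ts": 100, "id": 7}
//   {"ph": "t", "cat": "DataFlow", "name": "add_flow", "pid": 2, "tid": 0,
//    "ts": 250, "id": 7}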

void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64_t pid, int64_t ts,
    const string& device, int64_t bytes,
    const std::map<int64, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not an ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 0; i < kMaxDisplayedMemNode; ++i) {
    args2[absl::StrFormat("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[absl::StrFormat("Top Allocation %02d", count)] =
          Json::Value(absl::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2[std::string("Not Displayed")] =
      Json::Value(absl::StrFormat("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}
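
// EmitCounter emits two counter ("C") events per sample: one with the total
// "Allocator Bytes in Use", and one listing up to kMaxDisplayedMemNode of the
// largest live tensors plus a "Not Displayed" remainder. Example args for the
// second event (hypothetical tensor names and sizes):
//   {"Top Allocation 00": "0.5 MB from conv1/weights",
//    "Top Allocation 01": "N/A", ..., "Not Displayed": "0.05 MB"}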

string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    absl::FPrintF(stderr,
                  "Trace file is over 200MB. Chrome might not be able to "
                  "display it. Consider using filters (e.g. -min_micros "
                  "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}
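
// The serialized trace is a single JSON object whose "traceEvents" array holds
// the metadata events followed by the timed events, roughly:
//   {"traceEvents": [{"ph": "M", ...}, {"ph": "X", ...}, {"ph": "C", ...}]}
// This string can be loaded directly in chrome://tracing.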

void MemoryTracker::TrackNode(int64_t step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64, int64> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64_t last = 0;
  std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}
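
// aggregate_allocs turns per-timestamp allocation deltas into a running total
// for one tensor. Example (hypothetical numbers): deltas
//   {0: +256 (persistent), 100: +1024, 250: -1024}
// become {0: 256, 100: 1280, 250: 256}, i.e. the bytes the tensor holds at
// each recorded time.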

void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64_t pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64_t start_micros = exec.first;
        int64_t exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}
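
// op_execs(step_) maps each device name to the kernel executions recorded on
// it as (start_micros, exec_micros) pairs, e.g. (hypothetical values)
//   "/gpu:0/stream:all" -> {(1000, 30), (1200, 15)}
// Each device gets its own trace process (pid), and every execution becomes a
// TimeNode keyed by its start time.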

void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64_t start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  absl::FPrintF(stdout, "generating trace file.\n");
  int64_t flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64_t pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64_t pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64_t max_bytes_in_use = 0;
    int64_t cur_bytes_in_use = 0;
    int64_t last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Do not plot too densely, to reduce file size.
      int64_t ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      std::map<int64, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      absl::FPrintF(stdout, "%s peak memory: %.2f MB\n", dev.first,
                    max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}
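
// For each memory sample above, the inner loop looks up, per tensor, the last
// cumulative-allocation entry recorded before the sample time ts (or the
// map's first entry when nothing precedes ts): lower_bound(ts) returns the
// first entry with key >= ts, and stepping back one entry gives the preceding
// snapshot. Tensors with a positive size are then grouped by size in
// tensor_mem for the "Top Allocations" counter.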

void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::OutputTimeline() {
  std::string outfile = absl::StrFormat("%s_%d", outfile_, step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    absl::FPrintF(stderr, "Failed to write timeline file: %s\nError: %s\n",
                  outfile, s.ToString());
    return;
  }
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  absl::FPrintF(stdout,
                "Timeline file is written to %s.\n"
                "Open a Chrome browser, enter URL chrome://tracing and "
                "load the timeline file.",
                outfile);
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  fflush(stdout);
}
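
// One trace file is written per profiled step, named "<outfile_>_<step>".
// Example (hypothetical path): with outfile_ == "/tmp/timeline" and step 0,
// the trace is written to "/tmp/timeline_0".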

void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64_t start_time = tnode.second->start_micros;
      int64_t end_time =
          tnode.second->start_micros + tnode.second->exec_micros;
      int64_t l = -1;
      for (int64_t i = 0, end = p->lanes.size(); i < end; ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64, int64> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}
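
// AllocateLanes packs each process's time nodes into non-overlapping lanes,
// which become the thread ids (tid) of the trace. Example (hypothetical
// intervals): ops spanning [0, 10) and [10, 20) can share lane 0, while an op
// spanning [5, 15) overlaps both and ends up in a new lane 1.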

int64 Timeline::AllocatePID() {
  int64_t cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow