/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_timeline.h"

#include <utility>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
namespace {
int kMaxDisplayedMemNode = 10;

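// Names a device's op lane: GPU stream devices carry kernel execution,
// everything else carries op scheduling.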
std::string GetTimeDevName(const std::string& dev) {
  if (dev.find("stream") != dev.npos) {
    return absl::StrCat("Op execution threads: ", dev);
  } else {
    return absl::StrCat("Op scheduling threads: ", dev);
  }
}
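// Names the lane that plots a device's memory usage.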
std::string GetMemoryLaneName(const std::string& dev) {
  return absl::StrCat("mem usage on:", dev);
}
}  // namespace

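// Builds the fields common to all Chrome trace events, e.g.
//   {"ph": "X", "cat": "Op", "name": "MatMul", "pid": 0, "tid": 0, "ts": 1}
// Callers add phase-specific fields such as "dur", "id" or "args".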
Json::Value ChromeTraceFormatter::CreateEvent(const string& ph,
                                              const string& category,
                                              const string& name, int64_t pid,
                                              int64_t tid, int64_t ts) {
  Json::Value event(Json::objectValue);
  event["ph"] = Json::Value(ph);
  event["cat"] = Json::Value(category);
  event["name"] = Json::Value(name);
  event["pid"] = Json::Int64(pid);
  event["tid"] = Json::Int64(tid);
  event["ts"] = Json::Int64(ts);
  return event;
}

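// Emits a metadata ("M") event naming trace process `pid`; each device
// lane is registered this way.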
void ChromeTraceFormatter::EmitPID(const string& name, int64_t pid) {
  Json::Value event(Json::objectValue);
  event["name"] = Json::Value("process_name");
  event["ph"] = Json::Value("M");
  event["pid"] = Json::Int64(pid);
  Json::Value args(Json::objectValue);
  args["name"] = Json::Value(name);
  event["args"] = args;
  metadata_.push_back(event);
}

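// Emits a complete ("X") event: a region lasting `duration` microseconds
// on lane (`pid`, `tid`), starting at `ts`.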
void ChromeTraceFormatter::EmitRegion(int64_t ts, int64_t duration, int64_t pid,
                                      int64_t tid, const string& category,
                                      const string& name, Json::Value args) {
  Json::Value event = CreateEvent("X", category, name, pid, tid, ts);
  event["dur"] = Json::Int64(duration);
  event["args"] = std::move(args);
  metadata_.push_back(event);
}

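// Emits the start ("s") of a data-flow arrow between two op regions.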
void ChromeTraceFormatter::EmitFlowStart(const string& name, int64_t ts,
                                         int64_t pid, int64_t tid,
                                         int64_t flow_id) {
  Json::Value event = CreateEvent("s", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

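// Emits the end ("t") of a data-flow arrow; paired with its start by
// `flow_id`.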
void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64_t ts,
                                       int64_t pid, int64_t tid,
                                       int64_t flow_id) {
  Json::Value event = CreateEvent("t", "DataFlow", name, pid, tid, ts);
  event["id"] = Json::Int64(flow_id);
  events_.push_back(event);
}

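// Emits two counter ("C") series at timestamp `ts`: the allocator's total
// bytes in use, and (on a second lane) the largest tensor allocations alive.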
void ChromeTraceFormatter::EmitCounter(
    const string& category, const string& name, int64_t pid, int64_t ts,
    const string& device, int64_t bytes,
    const std::map<int64, std::vector<string>>& tensor_mem) {
  Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
  Json::Value args(Json::objectValue);
  args["Allocator Bytes in Use"] = Json::Int64(bytes);
  event["args"] = args;
  events_.push_back(event);

  // TODO(xpan): chrome://tracing is not an ideal visualization for memory.
  // It would be great to have a customized UI for it.
  Json::Value event2 =
      CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
  Json::Value args2(Json::objectValue);
  // Need to reserve the same args for all locations.
  for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
    args2[absl::StrFormat("Top Allocation %02d", i)] = Json::Value("N/A");
  }
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) {
        break;
      }
      args2[absl::StrFormat("Top Allocation %02d", count)] =
          Json::Value(absl::StrCat(it->first / 1000000.0, " MB from ", t));
      ++count;
      bytes -= it->first;
    }
  }
  args2[std::string("Not Displayed")] =
      Json::Value(absl::StrFormat("%.2f MB", bytes / 1000000.0));
  event2["args"] = args2;
  events_.push_back(event2);
}

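// Serializes all metadata and events into the JSON string understood by
// chrome://tracing, warning when the result is too large to display.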
string ChromeTraceFormatter::Format() {
  Json::Value trace;
  trace["traceEvents"] = Json::Value(Json::arrayValue);
  for (const Json::Value& v : metadata_) {
    trace["traceEvents"].append(v);
  }
  for (const Json::Value& v : events_) {
    trace["traceEvents"].append(v);
  }
  Json::FastWriter writer;
  string trace_str = writer.write(trace);
  if (trace_str.length() > 200 * 1024 * 1024) {
    absl::FPrintF(stderr,
                  "Trace file is over 200MB. Chrome might not be able to "
                  "display it. Consider using filters (e.g. -min_micros "
                  "> 1000 or -op_type .*gpu:0.*) to reduce the size.\n");
  }
  return trace_str;
}

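// Accumulates a node's step allocations (plus persistent accelerator
// memory) into per-device, per-tensor running totals.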
void MemoryTracker::TrackNode(int64_t step, const GraphNode* node) {
  if (!node->Trackable(step)) {
    return;
  }

  Device& dev = devices_[node->node->canonical_device()];

  std::map<int64, int64> allocs;
  for (const auto& alloc : node->node->allocations(step)) {
    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
  }
  dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
  allocs[0] += node->node->accelerator_persistent_bytes();

  int64_t last = 0;
  std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
  for (auto it = allocs.begin(); it != allocs.end(); ++it) {
    last += it->second;
    aggregate_allocs[it->first] = last;
  }
  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
    if (bytes_in_use.first <= 0) continue;
    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
  }
}

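// Recursively walks the displayed graph, creating a TimeNode per kernel
// execution and a trace process per device on first sight.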
void Timeline::AllocateTimeNodes(GraphNode* gnode) {
  if (gnode->Trackable(step_)) {
    TrackNode(gnode);
    const TFGraphNode* node = gnode->node;
    for (const auto& kernel_execs : node->op_execs(step_)) {
      const string& device = kernel_execs.first;

      if (process_.find(device) == process_.end()) {
        int64_t pid = AllocatePID();
        process_[device].reset(new Process(device, pid));
        chrome_formatter_.EmitPID(GetTimeDevName(device), pid);
      }
      Process* p = process_[device].get();

      for (const auto& exec : kernel_execs.second) {
        int64_t start_micros = exec.first;
        int64_t exec_micros = exec.second;
        // TODO(xpan): There might be start time duplication here.
        if (tnodes_[device].find(start_micros) == tnodes_[device].end()) {
          // TODO(xpan): Give each kernel call a unique_name.
          tnodes_[device][start_micros].reset(
              new TimeNode(p, gnode, start_micros, exec_micros));
        }
      }
    }
  }
  for (GraphNode* n : gnode->show_children) {
    AllocateTimeNodes(n);
  }
}

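// Emits the complete graph timeline: one region per op execution,
// cross-device data-flow arrows, and per-device memory counters.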
void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
  for (GraphNode* gnode : gnodes) {
    AllocateTimeNodes(gnode);
  }
  // To save memory, we only track cross-device (canonical device) flows.
  for (auto& process : tnodes_) {
    if (!IsCanonicalDevice(process.first)) continue;
    for (auto& tn : process.second) {
      TimeNode* tnode = tn.second.get();
      for (GraphNode* inp : tnode->node->children) {
        if (!inp->account || !inp->Trackable(step_)) {
          continue;
        }
        for (const auto& execs : inp->node->cpu_execs(step_)) {
          if (!IsCanonicalDevice(execs.first)) continue;
          if (process.first == execs.first) {
            // Not interested in flow within the same device.
            continue;
          }
          for (const auto& exec : execs.second) {
            int64_t start_micros = exec.first;
            auto cprocess = tnodes_.find(execs.first);
            if (cprocess == tnodes_.end()) continue;
            auto ctn = cprocess->second.find(start_micros);
            if (ctn == cprocess->second.end()) continue;
            ctn->second->next_tnodes.push_back(tnode);
          }
        }
      }
    }
  }

  AllocateLanes();
  absl::FPrintF(stdout, "generating trace file.\n");
  int64_t flow_id = 1;
  for (const auto& process : alloc_nodes_) {
    for (const auto& lane : process.second) {
      for (const auto& node : lane.second) {
        TimeNode* tnode = node.second;

        Json::Value args(Json::objectValue);
        args["name"] = Json::Value(tnode->name());
        chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
                                     process.first, lane.first, "Op",
                                     tnode->name(), args);
        // Flow is a directed arrow pointing from src to dst.
        // TODO(xpan): Disable flow to reduce json file size for now. Need
        // to think of a better way to make flow interpretable.
        for (TimeNode* next_tnode : node.second->next_tnodes) {
          chrome_formatter_.EmitFlowStart(
              tnode->name() + "_flow", tnode->start_micros + tnode->exec_micros,
              process.first, lane.first, flow_id);
          chrome_formatter_.EmitFlowEnd(
              tnode->name() + "_flow", next_tnode->start_micros,
              next_tnode->process->pid, next_tnode->tid, flow_id);
          flow_id += 1;
        }
      }
    }
  }
  for (const auto& dev : mem_tracker_.devices()) {
    if (IsPlacedOnCPU(dev.first)) {
      // TODO(xpan): Maybe also support CPU allocator memory tracking.
      continue;
    }
    int64_t pid = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
    int64_t pid2 = AllocatePID();
    chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
                              pid2);

    const MemoryTracker::Device& device = dev.second;

    int64_t max_bytes_in_use = 0;
    int64_t cur_bytes_in_use = 0;
    int64_t last_point = 0;
    for (const auto& alloc : device.allocations) {
      cur_bytes_in_use = alloc.second;
      max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
      // Do not plot too densely, to keep the file size down.
      int64_t ts = alloc.first;
      if (ts - last_point < 100) continue;
      last_point = ts;

      std::map<int64, std::vector<string>> tensor_mem;
      for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
        const auto& tensor_alloc = tensor_alloc_it.second;
        auto it = tensor_alloc.lower_bound(ts);
        if (it != tensor_alloc.begin()) {
          --it;
        }
        if (it->second > 0) {
          tensor_mem[it->second].push_back(tensor_alloc_it.first);
        }
      }
      chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
                                    dev.first, cur_bytes_in_use, tensor_mem);
    }
    if (IsPlacedOnAccelerator(dev.first)) {
      absl::FPrintF(stdout, "%s peak memory: %.2f MB\n", dev.first,
                    max_bytes_in_use / 1000000.0);
    }
  }
  OutputTimeline();
}

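// Scope and code timelines below render the profile tree as nested trace
// regions via EmitTreeNode.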
void Timeline::GenerateScopeTimeline(const ScopeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

void Timeline::GenerateCodeTimeline(const CodeNode* node) {
  std::set<int64> visited_depth;
  EmitTreeNode(node, 0, node->proto().total_exec_micros(), 0, &visited_depth);
  OutputTimeline();
}

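// Writes the JSON trace to "<outfile_>_<step>" and tells the user how to
// load it in chrome://tracing.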
void Timeline::OutputTimeline() {
  std::string outfile = absl::StrFormat("%s_%d", outfile_, step());
  Status s =
      WriteStringToFile(Env::Default(), outfile, chrome_formatter_.Format());
  if (!s.ok()) {
    absl::FPrintF(stderr, "Failed to write timeline file: %s\nError: %s\n",
                  outfile, s.ToString());
    return;
  }
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  absl::FPrintF(stdout,
                "Timeline file is written to %s.\n"
                "Open a Chrome browser, enter URL chrome://tracing and "
                "load the timeline file.",
                outfile);
  absl::FPrintF(stdout,
                "\n******************************************************\n");
  fflush(stdout);
}

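// Greedily packs each process's time nodes into non-overlapping lanes;
// a node's lane index becomes its thread id (tid) in the trace.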
void Timeline::AllocateLanes() {
  for (auto& process : tnodes_) {
    Process* p = process_[process.first].get();
    for (auto& tnode : process.second) {
      int64_t start_time = tnode.second->start_micros;
      int64_t end_time = tnode.second->start_micros + tnode.second->exec_micros;
      int64_t l = -1;
      for (int64_t i = 0, end = p->lanes.size(); i < end; ++i) {
        const auto& lane = p->lanes[i];
        l = i;
        for (auto cur_it = lane.rbegin(); cur_it != lane.rend(); ++cur_it) {
          if (cur_it->second > start_time) {
            l = -1;
            break;
          }
          if (start_time > cur_it->second) {
            break;
          }
        }
        if (l >= 0) {
          break;
        }
      }
      if (l < 0) {
        l = p->lanes.size();
        std::map<int64, int64> nlane;
        nlane[start_time] = end_time;
        p->lanes.push_back(nlane);
      } else {
        p->lanes[l][start_time] = end_time;
      }
      tnode.second->tid = l;
      alloc_nodes_[p->pid][l][start_time] = tnode.second.get();
    }
  }
}

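// Returns the next unused trace process id.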
int64 Timeline::AllocatePID() {
  int64_t cur_pid = next_pid_;
  next_pid_ += 1;
  return cur_pid;
}

}  // namespace tfprof
}  // namespace tensorflow