// (Code-browser navigation chrome removed; not part of the source file.)
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/utils/derived_timeline.h"

#include <algorithm>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "absl/types/optional.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/group_events.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/time_utils.h"
#include "tensorflow/core/profiler/utils/timespan.h"
#include "tensorflow/core/profiler/utils/trace_utils.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"

namespace tensorflow {
namespace profiler {
namespace {

const absl::string_view kAnnotationDelimiter = "::";

CreateXEvent(const XEventMetadata & metadata,int64 offset_ps,int64 duration_ps,int64 group_id_stat_metadata_id,absl::optional<int64> group_id)48 XEvent CreateXEvent(const XEventMetadata& metadata, int64 offset_ps,
49                     int64 duration_ps, int64 group_id_stat_metadata_id,
50                     absl::optional<int64> group_id) {
51   XEvent event;
52   event.set_metadata_id(metadata.id());
53   // TODO(b/150498419): Normalize with the line start time.
54   event.set_offset_ps(offset_ps);
55   event.set_duration_ps(duration_ps);
56   if (group_id) {
57     XStat* stat = event.add_stats();
58     stat->set_metadata_id(group_id_stat_metadata_id);
59     stat->set_int64_value(*group_id);
60   }
61   return event;
62 }

}  // namespace

ProcessTfOpEvent(absl::string_view tf_op_full_name,int64 offset_ps,int64 duration_ps,absl::optional<int64> group_id,XPlaneBuilder * plane_builder,DerivedXLineBuilder * tf_name_scope_line_builder,DerivedXLineBuilder * tf_op_line_builder)66 void ProcessTfOpEvent(absl::string_view tf_op_full_name, int64 offset_ps,
67                       int64 duration_ps, absl::optional<int64> group_id,
68                       XPlaneBuilder* plane_builder,
69                       DerivedXLineBuilder* tf_name_scope_line_builder,
70                       DerivedXLineBuilder* tf_op_line_builder) {
71   int64 group_id_stat_metadata_id =
72       plane_builder->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kGroupId))
73           ->id();
74   TfOp tf_op = ParseTfOpFullname(tf_op_full_name);
75   Category category = tf_op.category;
76   if (category == Category::kTensorFlow || category == Category::kJax) {
77     std::vector<XEvent> name_scope_event_per_level;
78     for (const auto& tf_name_scope : ParseTfNameScopes(tf_op)) {
79       name_scope_event_per_level.push_back(CreateXEvent(
80           *plane_builder->GetOrCreateEventMetadata(tf_name_scope), offset_ps,
81           duration_ps, group_id_stat_metadata_id, group_id));
82     }
83     tf_name_scope_line_builder->ExpandOrAddEvents(name_scope_event_per_level);
84   }
85   XEventMetadata* tf_op_event_metadata =
86       plane_builder->GetOrCreateEventMetadata(tf_op_full_name);
87   // Set the display name to op_type so that the events of the same op_type have
88   // the same color in the trace viewer.
89   tf_op_event_metadata->set_display_name(TfOpEventName(tf_op));
90   tf_op_line_builder->ExpandOrAddEvent(
91       CreateXEvent(*tf_op_event_metadata, offset_ps, duration_ps,
92                    group_id_stat_metadata_id, group_id));
93 }
DerivedXLineBuilder(XPlaneBuilder * plane,int64 line_id,absl::string_view name,int64 timestamp_ns,std::vector<DerivedXLineBuilder * > dependent_lines)95 DerivedXLineBuilder::DerivedXLineBuilder(
96     XPlaneBuilder* plane, int64 line_id, absl::string_view name,
97     int64 timestamp_ns, std::vector<DerivedXLineBuilder*> dependent_lines)
98     : line_(plane->GetOrCreateLine(line_id)) {
99   line_.SetName(name);
100   line_.SetTimestampNs(timestamp_ns);
101   dependent_lines_ = std::move(dependent_lines);
102 }
103 
// Adds `event` at the given nesting `level` of this line. If the previous
// event on that level has the same metadata, the previous event is extended
// to cover `event` instead of adding a new one (fragments of the same logical
// op are merged into a single contiguous slice). Callers must feed events in
// non-decreasing start-time order (enforced by the DCHECK below).
void DerivedXLineBuilder::ExpandOrAddLevelEvent(const XEvent& event,
                                                int level) {
  int64 offset_ps = event.offset_ps();
  int64 duration_ps = event.duration_ps();
  auto& last_event = last_event_by_level_[level];
  // If last_event is not nullptr, its offset must be less than or equal to
  // the given event's offset.
  DCHECK(!last_event || last_event->OffsetPs() <= offset_ps);
  if (last_event && last_event->MetadataId() == event.metadata_id()) {
    // If last_event is not nullptr and metadata is same, merge the given
    // event into last_event by stretching its duration to the new end time.
    last_event->SetDurationPs((offset_ps + duration_ps) -
                              last_event->OffsetPs());
  } else {
    // Otherwise, reset the last events lower than or equal to the given level,
    // so deeper levels cannot merge across this boundary.
    ResetLastEvents(level);
    // And create a new event for the given level.
    last_event = line_.AddEvent(event);
  }
}
124 
ResetLastEvents(int level)125 void DerivedXLineBuilder::ResetLastEvents(int level) {
126   for (int i = level, end = last_event_by_level_.size(); i < end; ++i) {
127     last_event_by_level_[i] = absl::nullopt;
128   }
129   if (level == 0) ResetDependentLines();
130 }
131 
// Derives step, XLA-module, XLA-op, TF-name-scope and TF-op lines on
// `device_trace` from the stats (group id, TF op, HLO op/module, kernel
// details) attached to its raw device events.
// - `symbol_resolver` maps an (hlo_module, hlo_op) pair back to a TF op name.
// - `group_metadata_map` supplies a human-readable name per group (step).
// - When `step_info_only` is true, only the step line is derived.
void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver,
                                 const GroupMetadataMap& group_metadata_map,
                                 XPlane* device_trace, bool step_info_only) {
  // Merge and sort events by Timespan as they come from different lines.
  std::vector<XEventVisitor> events;
  uint64 start_timestamp_ns = 0;
  XPlaneVisitor device_plane = CreateTfXPlaneVisitor(device_trace);
  device_plane.ForEachLine([&](const XLineVisitor& line) {
    if (IsDerivedThreadId(line.Id())) return;  // Skip overhead line.
    // NOTE(review): this keeps the timestamp of the *last* line visited;
    // assumes all non-derived lines of this plane share one start timestamp —
    // TODO confirm, otherwise the earliest timestamp would be safer.
    start_timestamp_ns = line.TimestampNs();
    line.ForEachEvent(
        [&](const XEventVisitor& event) { events.push_back(event); });
  });
  absl::c_sort(events);

  XPlaneBuilder plane(device_trace);
  // Derived lines. The last constructor argument lists dependent lines:
  // resetting a line at level 0 also resets those, e.g. a new step closes any
  // open HLO-module event, which in turn closes name-scope and HLO-op events.
  DerivedXLineBuilder tf_ops(&plane, kThreadIdTfOp, kTensorFlowOpLineName,
                             start_timestamp_ns, {});
  DerivedXLineBuilder tf_name_scope(&plane, kThreadIdTfNameScope,
                                    kTensorFlowNameScopeLineName,
                                    start_timestamp_ns, {&tf_ops});
  DerivedXLineBuilder hlo_ops(&plane, kThreadIdHloOp, kXlaOpLineName,
                              start_timestamp_ns, {});
  DerivedXLineBuilder hlo_modules(&plane, kThreadIdHloModule,
                                  kXlaModuleLineName, start_timestamp_ns,
                                  {&tf_name_scope, &hlo_ops});
  DerivedXLineBuilder steps(&plane, kThreadIdStepInfo, kStepLineName,
                            start_timestamp_ns, {&hlo_modules});
  int64 group_id_stat_metadata_id =
      plane.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kGroupId))->id();
  int64 step_name_stat_metadata_id =
      plane.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kStepName))->id();

  // Process events in order by start time.
  for (const XEventVisitor& event : events) {
    int64 offset_ps = event.OffsetPs();
    int64 duration_ps = event.DurationPs();
    absl::string_view tf_op_full_name;
    absl::string_view hlo_module_name;
    std::vector<absl::string_view> hlo_op_names;
    absl::optional<int64> group_id;
    bool is_kernel = false;
    // Collect the annotations attached to this event.
    event.ForEachStat([&](const XStatVisitor& stat) {
      if (stat.Type() == StatType::kGroupId) {
        group_id = stat.IntValue();
      } else if (stat.Type() == StatType::kLevel0 ||  // old way to carry tf_op
                 stat.Type() == StatType::kTfOp) {
        tf_op_full_name = stat.StrOrRefValue();
      } else if (stat.Type() == StatType::kHloOp) {
        // The HLO-op annotation is a "::"-delimited stack of op names.
        hlo_op_names =
            absl::StrSplit(stat.StrOrRefValue(), kAnnotationDelimiter);
      } else if (stat.Type() == StatType::kHloModule) {
        hlo_module_name = stat.StrOrRefValue();
      } else if (stat.Type() == StatType::kKernelDetails) {
        is_kernel = true;
      }
    });

    // Step line: one event per group id, carrying the step name when known.
    if (group_id) {
      XEvent step_event = CreateXEvent(
          *plane.GetOrCreateEventMetadata(absl::StrCat(*group_id)), offset_ps,
          duration_ps, group_id_stat_metadata_id, group_id);
      if (auto group_metadata =
              gtl::FindOrNull(group_metadata_map, *group_id)) {
        XStat* stat = step_event.add_stats();
        stat->set_metadata_id(step_name_stat_metadata_id);
        stat->set_str_value(group_metadata->name);
      }
      steps.ExpandOrAddEvent(step_event);
    }

    if (step_info_only) continue;

    // For HLO/TF op lines, only use kernel events (i.e. excluding memcpy or
    // allocation events).
    if (!is_kernel) continue;

    if (!hlo_module_name.empty()) {
      hlo_modules.ExpandOrAddEvent(CreateXEvent(
          *plane.GetOrCreateEventMetadata(hlo_module_name), offset_ps,
          duration_ps, group_id_stat_metadata_id, group_id));
    }

    if (!hlo_op_names.empty()) {  // GPU kernel compiled by XLA
      DCHECK(!hlo_module_name.empty());
      std::vector<XEvent> hlo_op_event_per_level;
      for (absl::string_view hlo_op_name : hlo_op_names) {
        DCHECK(!hlo_op_name.empty());
        hlo_op_event_per_level.push_back(CreateXEvent(
            *plane.GetOrCreateEventMetadata(hlo_op_name), offset_ps,
            duration_ps, group_id_stat_metadata_id, group_id));
      }
      hlo_ops.ExpandOrAddEvents(hlo_op_event_per_level);
      // Map the innermost HLO op back to a TF op, if the resolver knows it.
      auto tf_op_name = symbol_resolver(hlo_module_name, hlo_op_names.back());
      if (!tf_op_name.empty()) {
        ProcessTfOpEvent(tf_op_name, offset_ps, duration_ps, group_id, &plane,
                         &tf_name_scope, &tf_ops);
      }
    } else if (!tf_op_full_name.empty()) {  // GPU kernel not compiled by XLA
      ProcessTfOpEvent(tf_op_full_name, offset_ps, duration_ps, group_id,
                       &plane, &tf_name_scope, &tf_ops);
    }
  }
  // Drop any derived line that ended up with no events.
  RemoveEmptyLines(device_trace);
}
// Aggregates host-side kernel/memcpy launch events per (device, group) and
// writes one "Launch Stats for <step>" summary event per group onto a derived
// kernel-launch line of the matching device plane.
void DeriveEventsFromHostTrace(const XPlane* host_trace,
                               const GroupMetadataMap& group_metadata_map,
                               std::vector<XPlane*> device_traces) {
  struct GroupLaunchInfo {  // "Group" normally means step.
    Timespan timespan;  // Union of all launch-event spans in this group.
    int32 num_launches = 0;
    uint64 max_launch_time_ps = 0ULL;
    uint64 total_launch_time_ps = 0ULL;
  };
  typedef absl::flat_hash_map<int64 /*group_id*/, GroupLaunchInfo>
      DeviceLaunchInfo;

  int num_devices = device_traces.size();
  std::vector<DeviceLaunchInfo> per_device_launch_info(num_devices);

  // Pass 1: accumulate launch statistics from the host trace.
  XPlaneVisitor host_plane = CreateTfXPlaneVisitor(host_trace);
  host_plane.ForEachLine([&](const XLineVisitor& line) {
    if (IsDerivedThreadId(line.Id())) return;
    line.ForEachEvent([&](const XEventVisitor& event) {
      absl::optional<int64> group_id;
      absl::optional<int64> device_id;
      absl::optional<int64> correlation_id;
      // Filter out API calls for cuEventRecord/cuEventQuery/cuCtxSynchronize
      // etc for now. TODO: find a better way to filter out only the memcpy and
      // kernel launch events.
      if (absl::StartsWith(event.Name(), "cu")) return;
      event.ForEachStat([&](const XStatVisitor& stat) {
        if (stat.Type() == StatType::kGroupId) {
          group_id = stat.IntValue();
        } else if (stat.Type() == StatType::kDeviceId) {
          device_id = stat.IntOrUintValue();
        } else if (stat.Type() == StatType::kCorrelationId) {
          correlation_id = stat.IntValue();
        }
      });
      if (group_id && device_id && correlation_id && *device_id >= 0 &&
          *device_id < num_devices) {
        // This is a launch event on a known device.
        GroupLaunchInfo& group_launch_info =
            per_device_launch_info[*device_id][*group_id];
        Timespan& group_span = group_launch_info.timespan;
        Timespan event_span = event.GetTimespan();
        if (group_launch_info.num_launches) {  // Existing group.
          group_span.ExpandToInclude(event_span);
        } else {
          group_span = event_span;
        }
        ++group_launch_info.num_launches;
        group_launch_info.max_launch_time_ps = std::max(
            group_launch_info.max_launch_time_ps, event_span.duration_ps());
        group_launch_info.total_launch_time_ps += event_span.duration_ps();
      }
    });
  });

  // Pass 2: emit one summary event per (device, group) on the device plane.
  uint64 host_plane_start = GetStartTimestampNs(*host_trace);
  for (int i = 0; i < num_devices; ++i) {
    if (per_device_launch_info[i].empty()) continue;
    uint64 device_plane_start = GetStartTimestampNs(*device_traces[i]);
    XPlaneBuilder device_plane(device_traces[i]);
    XLineBuilder launch_line =
        device_plane.GetOrCreateLine(kThreadIdKernelLaunch);
    launch_line.SetName(kKernelLaunchLineName);
    launch_line.SetTimestampNs(std::min(device_plane_start, host_plane_start));
    for (const auto& kv : per_device_launch_info[i]) {
      int64 group_id = kv.first;
      const GroupLaunchInfo& group_info = kv.second;
      // Only groups with a known (step) name are emitted.
      if (auto group_metadata = gtl::FindOrNull(group_metadata_map, group_id)) {
        XEventBuilder device_event =
            launch_line.AddEvent(*device_plane.GetOrCreateEventMetadata(
                absl::StrCat("Launch Stats for ", group_metadata->name)));
        // Spans were measured on the host, so the event timestamp is anchored
        // at the host plane's start time.
        device_event.SetTimestampNs(
            host_plane_start + PicosToNanos(group_info.timespan.begin_ps()));
        device_event.SetDurationPs(group_info.timespan.duration_ps());
        device_event.AddStatValue(*device_plane.GetOrCreateStatMetadata(
                                      GetStatTypeStr(StatType::kGroupId)),
                                  group_id);
        device_event.AddStatValue(
            *device_plane.GetOrCreateStatMetadata("num_launches"),
            group_info.num_launches);
        device_event.AddStatValue(
            *device_plane.GetOrCreateStatMetadata("max_launch_time_us"),
            PicosToMicros(group_info.max_launch_time_ps));
        // num_launches > 0 here: an entry exists only after a launch was seen,
        // so this division is safe.
        device_event.AddStatValue(
            *device_plane.GetOrCreateStatMetadata("avg_launch_time_us"),
            PicosToMicros(group_info.total_launch_time_ps /
                          group_info.num_launches));
      }
    }
  }
}
GenerateDerivedTimeLines(const GroupMetadataMap & group_metadata_map,XSpace * space,bool step_info_only)330 void GenerateDerivedTimeLines(const GroupMetadataMap& group_metadata_map,
331                               XSpace* space, bool step_info_only) {
332   // TODO(profiler): Once we capture HLO protos for xla/gpu, we should use that
333   // to look up tensorflow op name from hlo_module/hlo_op.
334   auto dummy_symbol_resolver = [](absl::string_view hlo_module,
335                                   absl::string_view hlo_op) {
336     return absl::string_view();
337   };
338   std::vector<XPlane*> device_traces =
339       FindMutablePlanesWithPrefix(space, kGpuPlanePrefix);
340   for (XPlane* plane : device_traces) {
341     DeriveEventsFromAnnotations(dummy_symbol_resolver, group_metadata_map,
342                                 plane, step_info_only);
343   }
344 }

}  // namespace profiler
}  // namespace tensorflow