1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/core/profiler/utils/derived_timeline.h"
16
17 #include <algorithm>
18 #include <utility>
19 #include <vector>
20
21 #include "absl/algorithm/container.h"
22 #include "absl/container/flat_hash_map.h"
23 #include "absl/strings/match.h"
24 #include "absl/strings/str_cat.h"
25 #include "absl/strings/str_split.h"
26 #include "absl/strings/string_view.h"
27 #include "absl/types/optional.h"
28 #include "tensorflow/core/lib/gtl/map_util.h"
29 #include "tensorflow/core/platform/types.h"
30 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
31 #include "tensorflow/core/profiler/utils/group_events.h"
32 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
33 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
34 #include "tensorflow/core/profiler/utils/time_utils.h"
35 #include "tensorflow/core/profiler/utils/timespan.h"
36 #include "tensorflow/core/profiler/utils/trace_utils.h"
37 #include "tensorflow/core/profiler/utils/xplane_builder.h"
38 #include "tensorflow/core/profiler/utils/xplane_schema.h"
39 #include "tensorflow/core/profiler/utils/xplane_utils.h"
40 #include "tensorflow/core/profiler/utils/xplane_visitor.h"
41
42 namespace tensorflow {
43 namespace profiler {
44 namespace {
45
46 const absl::string_view kAnnotationDelimiter = "::";
47
CreateXEvent(const XEventMetadata & metadata,int64 offset_ps,int64 duration_ps,int64 group_id_stat_metadata_id,absl::optional<int64> group_id)48 XEvent CreateXEvent(const XEventMetadata& metadata, int64 offset_ps,
49 int64 duration_ps, int64 group_id_stat_metadata_id,
50 absl::optional<int64> group_id) {
51 XEvent event;
52 event.set_metadata_id(metadata.id());
53 // TODO(b/150498419): Normalize with the line start time.
54 event.set_offset_ps(offset_ps);
55 event.set_duration_ps(duration_ps);
56 if (group_id) {
57 XStat* stat = event.add_stats();
58 stat->set_metadata_id(group_id_stat_metadata_id);
59 stat->set_int64_value(*group_id);
60 }
61 return event;
62 }
63
64 } // namespace
65
ProcessTfOpEvent(absl::string_view tf_op_full_name,int64 offset_ps,int64 duration_ps,absl::optional<int64> group_id,XPlaneBuilder * plane_builder,DerivedXLineBuilder * tf_name_scope_line_builder,DerivedXLineBuilder * tf_op_line_builder)66 void ProcessTfOpEvent(absl::string_view tf_op_full_name, int64 offset_ps,
67 int64 duration_ps, absl::optional<int64> group_id,
68 XPlaneBuilder* plane_builder,
69 DerivedXLineBuilder* tf_name_scope_line_builder,
70 DerivedXLineBuilder* tf_op_line_builder) {
71 int64 group_id_stat_metadata_id =
72 plane_builder->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kGroupId))
73 ->id();
74 TfOp tf_op = ParseTfOpFullname(tf_op_full_name);
75 Category category = tf_op.category;
76 if (category == Category::kTensorFlow || category == Category::kJax) {
77 std::vector<XEvent> name_scope_event_per_level;
78 for (const auto& tf_name_scope : ParseTfNameScopes(tf_op)) {
79 name_scope_event_per_level.push_back(CreateXEvent(
80 *plane_builder->GetOrCreateEventMetadata(tf_name_scope), offset_ps,
81 duration_ps, group_id_stat_metadata_id, group_id));
82 }
83 tf_name_scope_line_builder->ExpandOrAddEvents(name_scope_event_per_level);
84 }
85 XEventMetadata* tf_op_event_metadata =
86 plane_builder->GetOrCreateEventMetadata(tf_op_full_name);
87 // Set the display name to op_type so that the events of the same op_type have
88 // the same color in the trace viewer.
89 tf_op_event_metadata->set_display_name(TfOpEventName(tf_op));
90 tf_op_line_builder->ExpandOrAddEvent(
91 CreateXEvent(*tf_op_event_metadata, offset_ps, duration_ps,
92 group_id_stat_metadata_id, group_id));
93 }
94
DerivedXLineBuilder(XPlaneBuilder * plane,int64 line_id,absl::string_view name,int64 timestamp_ns,std::vector<DerivedXLineBuilder * > dependent_lines)95 DerivedXLineBuilder::DerivedXLineBuilder(
96 XPlaneBuilder* plane, int64 line_id, absl::string_view name,
97 int64 timestamp_ns, std::vector<DerivedXLineBuilder*> dependent_lines)
98 : line_(plane->GetOrCreateLine(line_id)) {
99 line_.SetName(name);
100 line_.SetTimestampNs(timestamp_ns);
101 dependent_lines_ = std::move(dependent_lines);
102 }
103
ExpandOrAddLevelEvent(const XEvent & event,int level)104 void DerivedXLineBuilder::ExpandOrAddLevelEvent(const XEvent& event,
105 int level) {
106 int64 offset_ps = event.offset_ps();
107 int64 duration_ps = event.duration_ps();
108 auto& last_event = last_event_by_level_[level];
109 // If last_event is not nullptr, its offset must be less than or equal to
110 // the given event's offset.
111 DCHECK(!last_event || last_event->OffsetPs() <= offset_ps);
112 if (last_event && last_event->MetadataId() == event.metadata_id()) {
113 // If last_event is not nullptr and metadata is same, merge the given
114 // event into last_event.
115 last_event->SetDurationPs((offset_ps + duration_ps) -
116 last_event->OffsetPs());
117 } else {
118 // Otherwise, reset the last events lower than or equal to the given level.
119 ResetLastEvents(level);
120 // And create a new event for the given level.
121 last_event = line_.AddEvent(event);
122 }
123 }
124
ResetLastEvents(int level)125 void DerivedXLineBuilder::ResetLastEvents(int level) {
126 for (int i = level, end = last_event_by_level_.size(); i < end; ++i) {
127 last_event_by_level_[i] = absl::nullopt;
128 }
129 if (level == 0) ResetDependentLines();
130 }
131
DeriveEventsFromAnnotations(const SymbolResolver & symbol_resolver,const GroupMetadataMap & group_metadata_map,XPlane * device_trace,bool step_info_only)132 void DeriveEventsFromAnnotations(const SymbolResolver& symbol_resolver,
133 const GroupMetadataMap& group_metadata_map,
134 XPlane* device_trace, bool step_info_only) {
135 // Merge and sort events by Timespan as they come from different lines.
136 std::vector<XEventVisitor> events;
137 uint64 start_timestamp_ns = 0;
138 XPlaneVisitor device_plane = CreateTfXPlaneVisitor(device_trace);
139 device_plane.ForEachLine([&](const XLineVisitor& line) {
140 if (IsDerivedThreadId(line.Id())) return; // Skip overhead line.
141 start_timestamp_ns = line.TimestampNs();
142 line.ForEachEvent(
143 [&](const XEventVisitor& event) { events.push_back(event); });
144 });
145 absl::c_sort(events);
146
147 XPlaneBuilder plane(device_trace);
148 DerivedXLineBuilder tf_ops(&plane, kThreadIdTfOp, kTensorFlowOpLineName,
149 start_timestamp_ns, {});
150 DerivedXLineBuilder tf_name_scope(&plane, kThreadIdTfNameScope,
151 kTensorFlowNameScopeLineName,
152 start_timestamp_ns, {&tf_ops});
153 DerivedXLineBuilder hlo_ops(&plane, kThreadIdHloOp, kXlaOpLineName,
154 start_timestamp_ns, {});
155 DerivedXLineBuilder hlo_modules(&plane, kThreadIdHloModule,
156 kXlaModuleLineName, start_timestamp_ns,
157 {&tf_name_scope, &hlo_ops});
158 DerivedXLineBuilder steps(&plane, kThreadIdStepInfo, kStepLineName,
159 start_timestamp_ns, {&hlo_modules});
160 int64 group_id_stat_metadata_id =
161 plane.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kGroupId))->id();
162 int64 step_name_stat_metadata_id =
163 plane.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kStepName))->id();
164
165 // Process events in order by start time.
166 for (const XEventVisitor& event : events) {
167 int64 offset_ps = event.OffsetPs();
168 int64 duration_ps = event.DurationPs();
169 absl::string_view tf_op_full_name;
170 absl::string_view hlo_module_name;
171 std::vector<absl::string_view> hlo_op_names;
172 absl::optional<int64> group_id;
173 bool is_kernel = false;
174 event.ForEachStat([&](const XStatVisitor& stat) {
175 if (stat.Type() == StatType::kGroupId) {
176 group_id = stat.IntValue();
177 } else if (stat.Type() == StatType::kLevel0 || // old way to carry tf_op
178 stat.Type() == StatType::kTfOp) {
179 tf_op_full_name = stat.StrOrRefValue();
180 } else if (stat.Type() == StatType::kHloOp) {
181 hlo_op_names =
182 absl::StrSplit(stat.StrOrRefValue(), kAnnotationDelimiter);
183 } else if (stat.Type() == StatType::kHloModule) {
184 hlo_module_name = stat.StrOrRefValue();
185 } else if (stat.Type() == StatType::kKernelDetails) {
186 is_kernel = true;
187 }
188 });
189
190 if (group_id) {
191 XEvent step_event = CreateXEvent(
192 *plane.GetOrCreateEventMetadata(absl::StrCat(*group_id)), offset_ps,
193 duration_ps, group_id_stat_metadata_id, group_id);
194 if (auto group_metadata =
195 gtl::FindOrNull(group_metadata_map, *group_id)) {
196 XStat* stat = step_event.add_stats();
197 stat->set_metadata_id(step_name_stat_metadata_id);
198 stat->set_str_value(group_metadata->name);
199 }
200 steps.ExpandOrAddEvent(step_event);
201 }
202
203 if (step_info_only) continue;
204
205 // For HLO/TF op lines, only use kernel events (i.e. excluding memcpy or
206 // allocation events).
207 if (!is_kernel) continue;
208
209 if (!hlo_module_name.empty()) {
210 hlo_modules.ExpandOrAddEvent(CreateXEvent(
211 *plane.GetOrCreateEventMetadata(hlo_module_name), offset_ps,
212 duration_ps, group_id_stat_metadata_id, group_id));
213 }
214
215 if (!hlo_op_names.empty()) { // GPU kernel compiled by XLA
216 DCHECK(!hlo_module_name.empty());
217 std::vector<XEvent> hlo_op_event_per_level;
218 for (absl::string_view hlo_op_name : hlo_op_names) {
219 DCHECK(!hlo_op_name.empty());
220 hlo_op_event_per_level.push_back(CreateXEvent(
221 *plane.GetOrCreateEventMetadata(hlo_op_name), offset_ps,
222 duration_ps, group_id_stat_metadata_id, group_id));
223 }
224 hlo_ops.ExpandOrAddEvents(hlo_op_event_per_level);
225 auto tf_op_name = symbol_resolver(hlo_module_name, hlo_op_names.back());
226 if (!tf_op_name.empty()) {
227 ProcessTfOpEvent(tf_op_name, offset_ps, duration_ps, group_id, &plane,
228 &tf_name_scope, &tf_ops);
229 }
230 } else if (!tf_op_full_name.empty()) { // GPU kernel not compiled by XLA
231 ProcessTfOpEvent(tf_op_full_name, offset_ps, duration_ps, group_id,
232 &plane, &tf_name_scope, &tf_ops);
233 }
234 }
235 RemoveEmptyLines(device_trace);
236 }
237
DeriveEventsFromHostTrace(const XPlane * host_trace,const GroupMetadataMap & group_metadata_map,std::vector<XPlane * > device_traces)238 void DeriveEventsFromHostTrace(const XPlane* host_trace,
239 const GroupMetadataMap& group_metadata_map,
240 std::vector<XPlane*> device_traces) {
241 struct GroupLaunchInfo { // "Group" normally means step.
242 Timespan timespan;
243 int32 num_launches = 0;
244 uint64 max_launch_time_ps = 0ULL;
245 uint64 total_launch_time_ps = 0ULL;
246 };
247 typedef absl::flat_hash_map<int64 /*group_id*/, GroupLaunchInfo>
248 DeviceLaunchInfo;
249
250 int num_devices = device_traces.size();
251 std::vector<DeviceLaunchInfo> per_device_launch_info(num_devices);
252
253 XPlaneVisitor host_plane = CreateTfXPlaneVisitor(host_trace);
254 host_plane.ForEachLine([&](const XLineVisitor& line) {
255 if (IsDerivedThreadId(line.Id())) return;
256 line.ForEachEvent([&](const XEventVisitor& event) {
257 absl::optional<int64> group_id;
258 absl::optional<int64> device_id;
259 absl::optional<int64> correlation_id;
260 // Filter out API calls for cuEventRecord/cuEventQuery/cuCtxSynchronize
261 // etc for now. TODO: find a better way to filter out only the memcpy and
262 // kernel launch events.
263 if (absl::StartsWith(event.Name(), "cu")) return;
264 event.ForEachStat([&](const XStatVisitor& stat) {
265 if (stat.Type() == StatType::kGroupId) {
266 group_id = stat.IntValue();
267 } else if (stat.Type() == StatType::kDeviceId) {
268 device_id = stat.IntOrUintValue();
269 } else if (stat.Type() == StatType::kCorrelationId) {
270 correlation_id = stat.IntValue();
271 }
272 });
273 if (group_id && device_id && correlation_id && *device_id >= 0 &&
274 *device_id < num_devices) {
275 // This is a launch event on a known device.
276 GroupLaunchInfo& group_launch_info =
277 per_device_launch_info[*device_id][*group_id];
278 Timespan& group_span = group_launch_info.timespan;
279 Timespan event_span = event.GetTimespan();
280 if (group_launch_info.num_launches) { // Existing group.
281 group_span.ExpandToInclude(event_span);
282 } else {
283 group_span = event_span;
284 }
285 ++group_launch_info.num_launches;
286 group_launch_info.max_launch_time_ps = std::max(
287 group_launch_info.max_launch_time_ps, event_span.duration_ps());
288 group_launch_info.total_launch_time_ps += event_span.duration_ps();
289 }
290 });
291 });
292
293 uint64 host_plane_start = GetStartTimestampNs(*host_trace);
294 for (int i = 0; i < num_devices; ++i) {
295 if (per_device_launch_info[i].empty()) continue;
296 uint64 device_plane_start = GetStartTimestampNs(*device_traces[i]);
297 XPlaneBuilder device_plane(device_traces[i]);
298 XLineBuilder launch_line =
299 device_plane.GetOrCreateLine(kThreadIdKernelLaunch);
300 launch_line.SetName(kKernelLaunchLineName);
301 launch_line.SetTimestampNs(std::min(device_plane_start, host_plane_start));
302 for (const auto& kv : per_device_launch_info[i]) {
303 int64 group_id = kv.first;
304 const GroupLaunchInfo& group_info = kv.second;
305 if (auto group_metadata = gtl::FindOrNull(group_metadata_map, group_id)) {
306 XEventBuilder device_event =
307 launch_line.AddEvent(*device_plane.GetOrCreateEventMetadata(
308 absl::StrCat("Launch Stats for ", group_metadata->name)));
309 device_event.SetTimestampNs(
310 host_plane_start + PicosToNanos(group_info.timespan.begin_ps()));
311 device_event.SetDurationPs(group_info.timespan.duration_ps());
312 device_event.AddStatValue(*device_plane.GetOrCreateStatMetadata(
313 GetStatTypeStr(StatType::kGroupId)),
314 group_id);
315 device_event.AddStatValue(
316 *device_plane.GetOrCreateStatMetadata("num_launches"),
317 group_info.num_launches);
318 device_event.AddStatValue(
319 *device_plane.GetOrCreateStatMetadata("max_launch_time_us"),
320 PicosToMicros(group_info.max_launch_time_ps));
321 device_event.AddStatValue(
322 *device_plane.GetOrCreateStatMetadata("avg_launch_time_us"),
323 PicosToMicros(group_info.total_launch_time_ps /
324 group_info.num_launches));
325 }
326 }
327 }
328 }
329
GenerateDerivedTimeLines(const GroupMetadataMap & group_metadata_map,XSpace * space,bool step_info_only)330 void GenerateDerivedTimeLines(const GroupMetadataMap& group_metadata_map,
331 XSpace* space, bool step_info_only) {
332 // TODO(profiler): Once we capture HLO protos for xla/gpu, we should use that
333 // to look up tensorflow op name from hlo_module/hlo_op.
334 auto dummy_symbol_resolver = [](absl::string_view hlo_module,
335 absl::string_view hlo_op) {
336 return absl::string_view();
337 };
338 std::vector<XPlane*> device_traces =
339 FindMutablePlanesWithPrefix(space, kGpuPlanePrefix);
340 for (XPlane* plane : device_traces) {
341 DeriveEventsFromAnnotations(dummy_symbol_resolver, group_metadata_map,
342 plane, step_info_only);
343 }
344 }
345
346 } // namespace profiler
347 } // namespace tensorflow
348