1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/core/profiler/convert/xplane_to_step_stats.h"

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/parse_annotation.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/time_utils.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"
33
34 namespace tensorflow {
35 namespace profiler {
36 namespace {
37
// Host-side launch information for a device activity; stored in a map keyed
// by the driver-API correlation id (see ConvertGpuXSpaceToStepStats).
struct CorrelationInfo {
  uint32_t thread_id;        // Host thread that issued the driver API call.
  uint64_t enqueue_time_ns;  // Timestamp (ns) of the enqueueing host event.
};
42
// Classification of a GPU device event, used to group events into separate
// per-stream DeviceStepStats entries.
enum GpuEventType {
  kUnknown,
  kKernel,
  kMemcpyH2D,  // Memcpy, host to device.
  kMemcpyD2H,  // Memcpy, device to host.
  kMemcpyD2D,  // Memcpy, device to device.
  kMemcpyP2P,  // Memcpy, peer to peer.
};
51
ParseMemcpyName(absl::string_view memcpy_name)52 GpuEventType ParseMemcpyName(absl::string_view memcpy_name) {
53 if (absl::ConsumePrefix(&memcpy_name, "Memcpy")) {
54 if (memcpy_name == "H2D") return GpuEventType::kMemcpyH2D;
55 if (memcpy_name == "D2H") return GpuEventType::kMemcpyD2H;
56 if (memcpy_name == "D2D") return GpuEventType::kMemcpyD2D;
57 if (memcpy_name == "P2P") return GpuEventType::kMemcpyP2P;
58 }
59 return GpuEventType::kUnknown;
60 }
61
SetNodeTimes(const XEventVisitor & event,NodeExecStats * ns)62 void SetNodeTimes(const XEventVisitor& event, NodeExecStats* ns) {
63 ns->set_all_start_micros(NanosToMicros(event.TimestampNs()));
64 ns->set_op_start_rel_micros(0);
65 ns->set_op_end_rel_micros(NanosToMicros(event.DurationNs()));
66 ns->set_all_end_rel_micros(NanosToMicros(event.DurationNs()));
67 }
68
69 } // namespace
70
ConvertGpuXSpaceToStepStats(const XSpace & xspace,StepStats * step_stats)71 void ConvertGpuXSpaceToStepStats(const XSpace& xspace, StepStats* step_stats) {
72 std::vector<const XPlane*> device_planes =
73 FindPlanesWithPrefix(xspace, kGpuPlanePrefix);
74 if (device_planes.empty()) {
75 LOG(WARNING) << "GPU trace was not collected.";
76 return;
77 }
78 std::vector<const XPlane*> host_planes = FindPlanesWithNames(
79 xspace, {kCuptiDriverApiPlaneName, kRoctracerApiPlaneName});
80 DCHECK_LE(host_planes.size(), 1);
81
82 absl::flat_hash_map<int64_t /*correlation_id*/, CorrelationInfo>
83 correlation_info_map;
84 for (const XPlane* host_plane : host_planes) {
85 absl::flat_hash_map<uint32_t /*device_id*/, DeviceStepStats*>
86 sync_dev_stats_map;
87 XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
88 plane.ForEachLine([&](const XLineVisitor& line) {
89 uint32_t thread_id = line.Id();
90 line.ForEachEvent([&](const XEventVisitor& event) {
91 if (event.Name() == "cuStreamSynchronize") {
92 auto device_id_stat = event.GetStat(StatType::kDeviceId);
93 if (device_id_stat.has_value()) {
94 uint32_t device_ordinal = device_id_stat->IntOrUintValue();
95 DeviceStepStats* sync_dev_stats =
96 sync_dev_stats_map[device_ordinal];
97 if (sync_dev_stats == nullptr) {
98 sync_dev_stats = step_stats->add_dev_stats();
99 sync_dev_stats->set_device(
100 absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
101 }
102 NodeExecStats* ns = sync_dev_stats->add_node_stats();
103 SetNodeTimes(event, ns);
104 ns->set_node_name(std::string(event.Name()));
105 ns->set_timeline_label(absl::StrCat("ThreadId ", thread_id));
106 ns->set_thread_id(thread_id);
107 }
108 } else {
109 auto correlation_id_stat = event.GetStat(StatType::kCorrelationId);
110 if (correlation_id_stat.has_value()) {
111 int64_t correlation_id = correlation_id_stat->IntValue();
112 uint64_t enqueue_time_ns = event.TimestampNs();
113 correlation_info_map[correlation_id] = {thread_id, enqueue_time_ns};
114 }
115 }
116 });
117 });
118 }
119 for (const XPlane* device_plane : device_planes) {
120 absl::flat_hash_map<std::pair<int64_t /*stream_id*/, GpuEventType>,
121 DeviceStepStats*>
122 stream_dev_stats_map;
123 DeviceStepStats* unknown_stream_dev_stats = nullptr;
124 DeviceStepStats* all_streams_dev_stats = nullptr;
125 DeviceStepStats* memcpy_dev_stats = nullptr;
126 XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
127 uint32_t device_ordinal = plane.Id();
128 plane.ForEachLine([&](const XLineVisitor& line) {
129 uint32_t stream_id = line.Id();
130 line.ForEachEvent([&](const XEventVisitor& event) {
131 int64_t correlation_id = -1;
132 absl::string_view annotation;
133 absl::string_view kernel_details;
134 absl::string_view memcpy_details;
135 event.ForEachStat([&](const XStatVisitor& stat) {
136 if (!stat.Type().has_value()) return;
137 switch (stat.Type().value()) {
138 case StatType::kCorrelationId:
139 correlation_id = stat.IntValue();
140 break;
141 case StatType::kKernelAnnotation:
142 annotation = stat.StrOrRefValue();
143 break;
144 case StatType::kKernelDetails:
145 kernel_details = stat.StrOrRefValue();
146 break;
147 case StatType::kMemcpyDetails:
148 memcpy_details = stat.StrOrRefValue();
149 break;
150 default:
151 break;
152 }
153 });
154
155 auto ns = absl::make_unique<NodeExecStats>();
156 SetNodeTimes(event, ns.get());
157
158 // Get launch information if available.
159 if (correlation_id > 0) {
160 auto it = correlation_info_map.find(correlation_id);
161 if (it != correlation_info_map.end()) {
162 const CorrelationInfo& correlation_info = it->second;
163 ns->set_scheduled_micros(
164 NanosToMicros(correlation_info.enqueue_time_ns));
165 ns->set_thread_id(correlation_info.thread_id);
166 }
167 }
168
169 absl::string_view node_name = event.Name();
170 if (!annotation.empty()) {
171 auto annotation_stack = ParseAnnotationStack(annotation);
172 if (!annotation_stack.empty()) {
173 node_name = annotation_stack.back().name;
174 }
175 }
176 ns->set_node_name(std::string(node_name));
177
178 if (!kernel_details.empty()) {
179 absl::string_view kernel_name = event.Name();
180 ns->set_timeline_label(
181 absl::StrCat(kernel_name, " ", kernel_details));
182 DeviceStepStats*& stream_dev_stats =
183 stream_dev_stats_map[{stream_id, GpuEventType::kKernel}];
184 if (stream_dev_stats == nullptr) {
185 stream_dev_stats = step_stats->add_dev_stats();
186 stream_dev_stats->set_device(absl::StrCat(
187 "/device:GPU:", device_ordinal, "/stream:", stream_id));
188 }
189 *stream_dev_stats->add_node_stats() = *ns;
190 if (all_streams_dev_stats == nullptr) {
191 all_streams_dev_stats = step_stats->add_dev_stats();
192 all_streams_dev_stats->set_device(
193 absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
194 }
195 all_streams_dev_stats->add_node_stats()->Swap(ns.get());
196
197 } else if (!memcpy_details.empty()) {
198 absl::string_view memcpy_name = event.Name();
199 ns->set_timeline_label(
200 absl::StrCat(memcpy_name, " ", memcpy_details));
201 GpuEventType gpu_event_type = ParseMemcpyName(memcpy_name);
202 DCHECK_NE(gpu_event_type, GpuEventType::kUnknown);
203 DeviceStepStats*& stream_dev_stats =
204 stream_dev_stats_map[{stream_id, gpu_event_type}];
205 if (stream_dev_stats == nullptr) {
206 stream_dev_stats = step_stats->add_dev_stats();
207 stream_dev_stats->set_device(
208 absl::StrCat("/device:GPU:", device_ordinal,
209 "/stream:", stream_id, "<", memcpy_name, ">"));
210 }
211 *stream_dev_stats->add_node_stats() = *ns;
212 if (memcpy_dev_stats == nullptr) {
213 memcpy_dev_stats = step_stats->add_dev_stats();
214 memcpy_dev_stats->set_device(
215 absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
216 }
217 memcpy_dev_stats->add_node_stats()->Swap(ns.get());
218
219 } else {
220 ns->set_timeline_label(std::string(node_name));
221 if (unknown_stream_dev_stats == nullptr) {
222 unknown_stream_dev_stats = step_stats->add_dev_stats();
223 unknown_stream_dev_stats->set_device(
224 absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
225 }
226 unknown_stream_dev_stats->add_node_stats()->Swap(ns.get());
227 }
228 });
229 });
230 }
231 }
232
233 } // namespace profiler
234 } // namespace tensorflow
235