• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/convert/xplane_to_step_stats.h"
17 
18 #include <cstdint>
19 #include <string>
20 
21 #include "absl/strings/str_cat.h"
22 #include "absl/strings/string_view.h"
23 #include "absl/strings/strip.h"
24 #include "tensorflow/core/framework/step_stats.pb.h"
25 #include "tensorflow/core/platform/types.h"
26 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
27 #include "tensorflow/core/profiler/utils/parse_annotation.h"
28 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
29 #include "tensorflow/core/profiler/utils/time_utils.h"
30 #include "tensorflow/core/profiler/utils/xplane_schema.h"
31 #include "tensorflow/core/profiler/utils/xplane_utils.h"
32 #include "tensorflow/core/profiler/utils/xplane_visitor.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 namespace {
37 
// Launch-side information recorded for one correlation id while walking the
// host plane, so that the matching device-side event can be annotated later.
struct CorrelationInfo {
  // Id of the host-plane line on which the launching event was found.
  uint32_t thread_id;
  // Timestamp, in nanoseconds, of the launching event.
  uint64_t enqueue_time_ns;
};
42 
// Category of a device-side event. Used together with the stream id as the key
// that selects which per-stream DeviceStepStats an event is appended to.
enum GpuEventType {
  kUnknown = 0,    // Not a kernel and not a recognized memcpy name.
  kKernel = 1,     // Event that carries kernel details.
  kMemcpyH2D = 2,  // "MemcpyH2D"
  kMemcpyD2H = 3,  // "MemcpyD2H"
  kMemcpyD2D = 4,  // "MemcpyD2D"
  kMemcpyP2P = 5,  // "MemcpyP2P"
};
51 
ParseMemcpyName(absl::string_view memcpy_name)52 GpuEventType ParseMemcpyName(absl::string_view memcpy_name) {
53   if (absl::ConsumePrefix(&memcpy_name, "Memcpy")) {
54     if (memcpy_name == "H2D") return GpuEventType::kMemcpyH2D;
55     if (memcpy_name == "D2H") return GpuEventType::kMemcpyD2H;
56     if (memcpy_name == "D2D") return GpuEventType::kMemcpyD2D;
57     if (memcpy_name == "P2P") return GpuEventType::kMemcpyP2P;
58   }
59   return GpuEventType::kUnknown;
60 }
61 
SetNodeTimes(const XEventVisitor & event,NodeExecStats * ns)62 void SetNodeTimes(const XEventVisitor& event, NodeExecStats* ns) {
63   ns->set_all_start_micros(NanosToMicros(event.TimestampNs()));
64   ns->set_op_start_rel_micros(0);
65   ns->set_op_end_rel_micros(NanosToMicros(event.DurationNs()));
66   ns->set_all_end_rel_micros(NanosToMicros(event.DurationNs()));
67 }
68 
69 }  // namespace
70 
ConvertGpuXSpaceToStepStats(const XSpace & xspace,StepStats * step_stats)71 void ConvertGpuXSpaceToStepStats(const XSpace& xspace, StepStats* step_stats) {
72   std::vector<const XPlane*> device_planes =
73       FindPlanesWithPrefix(xspace, kGpuPlanePrefix);
74   if (device_planes.empty()) {
75     LOG(WARNING) << "GPU trace was not collected.";
76     return;
77   }
78   std::vector<const XPlane*> host_planes = FindPlanesWithNames(
79       xspace, {kCuptiDriverApiPlaneName, kRoctracerApiPlaneName});
80   DCHECK_LE(host_planes.size(), 1);
81 
82   absl::flat_hash_map<int64_t /*correlation_id*/, CorrelationInfo>
83       correlation_info_map;
84   for (const XPlane* host_plane : host_planes) {
85     absl::flat_hash_map<uint32_t /*device_id*/, DeviceStepStats*>
86         sync_dev_stats_map;
87     XPlaneVisitor plane = CreateTfXPlaneVisitor(host_plane);
88     plane.ForEachLine([&](const XLineVisitor& line) {
89       uint32_t thread_id = line.Id();
90       line.ForEachEvent([&](const XEventVisitor& event) {
91         if (event.Name() == "cuStreamSynchronize") {
92           auto device_id_stat = event.GetStat(StatType::kDeviceId);
93           if (device_id_stat.has_value()) {
94             uint32_t device_ordinal = device_id_stat->IntOrUintValue();
95             DeviceStepStats* sync_dev_stats =
96                 sync_dev_stats_map[device_ordinal];
97             if (sync_dev_stats == nullptr) {
98               sync_dev_stats = step_stats->add_dev_stats();
99               sync_dev_stats->set_device(
100                   absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
101             }
102             NodeExecStats* ns = sync_dev_stats->add_node_stats();
103             SetNodeTimes(event, ns);
104             ns->set_node_name(std::string(event.Name()));
105             ns->set_timeline_label(absl::StrCat("ThreadId ", thread_id));
106             ns->set_thread_id(thread_id);
107           }
108         } else {
109           auto correlation_id_stat = event.GetStat(StatType::kCorrelationId);
110           if (correlation_id_stat.has_value()) {
111             int64_t correlation_id = correlation_id_stat->IntValue();
112             uint64_t enqueue_time_ns = event.TimestampNs();
113             correlation_info_map[correlation_id] = {thread_id, enqueue_time_ns};
114           }
115         }
116       });
117     });
118   }
119   for (const XPlane* device_plane : device_planes) {
120     absl::flat_hash_map<std::pair<int64_t /*stream_id*/, GpuEventType>,
121                         DeviceStepStats*>
122         stream_dev_stats_map;
123     DeviceStepStats* unknown_stream_dev_stats = nullptr;
124     DeviceStepStats* all_streams_dev_stats = nullptr;
125     DeviceStepStats* memcpy_dev_stats = nullptr;
126     XPlaneVisitor plane = CreateTfXPlaneVisitor(device_plane);
127     uint32_t device_ordinal = plane.Id();
128     plane.ForEachLine([&](const XLineVisitor& line) {
129       uint32_t stream_id = line.Id();
130       line.ForEachEvent([&](const XEventVisitor& event) {
131         int64_t correlation_id = -1;
132         absl::string_view annotation;
133         absl::string_view kernel_details;
134         absl::string_view memcpy_details;
135         event.ForEachStat([&](const XStatVisitor& stat) {
136           if (!stat.Type().has_value()) return;
137           switch (stat.Type().value()) {
138             case StatType::kCorrelationId:
139               correlation_id = stat.IntValue();
140               break;
141             case StatType::kKernelAnnotation:
142               annotation = stat.StrOrRefValue();
143               break;
144             case StatType::kKernelDetails:
145               kernel_details = stat.StrOrRefValue();
146               break;
147             case StatType::kMemcpyDetails:
148               memcpy_details = stat.StrOrRefValue();
149               break;
150             default:
151               break;
152           }
153         });
154 
155         auto ns = absl::make_unique<NodeExecStats>();
156         SetNodeTimes(event, ns.get());
157 
158         // Get launch information if available.
159         if (correlation_id > 0) {
160           auto it = correlation_info_map.find(correlation_id);
161           if (it != correlation_info_map.end()) {
162             const CorrelationInfo& correlation_info = it->second;
163             ns->set_scheduled_micros(
164                 NanosToMicros(correlation_info.enqueue_time_ns));
165             ns->set_thread_id(correlation_info.thread_id);
166           }
167         }
168 
169         absl::string_view node_name = event.Name();
170         if (!annotation.empty()) {
171           auto annotation_stack = ParseAnnotationStack(annotation);
172           if (!annotation_stack.empty()) {
173             node_name = annotation_stack.back().name;
174           }
175         }
176         ns->set_node_name(std::string(node_name));
177 
178         if (!kernel_details.empty()) {
179           absl::string_view kernel_name = event.Name();
180           ns->set_timeline_label(
181               absl::StrCat(kernel_name, " ", kernel_details));
182           DeviceStepStats*& stream_dev_stats =
183               stream_dev_stats_map[{stream_id, GpuEventType::kKernel}];
184           if (stream_dev_stats == nullptr) {
185             stream_dev_stats = step_stats->add_dev_stats();
186             stream_dev_stats->set_device(absl::StrCat(
187                 "/device:GPU:", device_ordinal, "/stream:", stream_id));
188           }
189           *stream_dev_stats->add_node_stats() = *ns;
190           if (all_streams_dev_stats == nullptr) {
191             all_streams_dev_stats = step_stats->add_dev_stats();
192             all_streams_dev_stats->set_device(
193                 absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
194           }
195           all_streams_dev_stats->add_node_stats()->Swap(ns.get());
196 
197         } else if (!memcpy_details.empty()) {
198           absl::string_view memcpy_name = event.Name();
199           ns->set_timeline_label(
200               absl::StrCat(memcpy_name, " ", memcpy_details));
201           GpuEventType gpu_event_type = ParseMemcpyName(memcpy_name);
202           DCHECK_NE(gpu_event_type, GpuEventType::kUnknown);
203           DeviceStepStats*& stream_dev_stats =
204               stream_dev_stats_map[{stream_id, gpu_event_type}];
205           if (stream_dev_stats == nullptr) {
206             stream_dev_stats = step_stats->add_dev_stats();
207             stream_dev_stats->set_device(
208                 absl::StrCat("/device:GPU:", device_ordinal,
209                              "/stream:", stream_id, "<", memcpy_name, ">"));
210           }
211           *stream_dev_stats->add_node_stats() = *ns;
212           if (memcpy_dev_stats == nullptr) {
213             memcpy_dev_stats = step_stats->add_dev_stats();
214             memcpy_dev_stats->set_device(
215                 absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
216           }
217           memcpy_dev_stats->add_node_stats()->Swap(ns.get());
218 
219         } else {
220           ns->set_timeline_label(std::string(node_name));
221           if (unknown_stream_dev_stats == nullptr) {
222             unknown_stream_dev_stats = step_stats->add_dev_stats();
223             unknown_stream_dev_stats->set_device(
224                 absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
225           }
226           unknown_stream_dev_stats->add_node_stats()->Swap(ns.get());
227         }
228       });
229     });
230   }
231 }
232 
233 }  // namespace profiler
234 }  // namespace tensorflow
235