1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
16 
17 #include <sstream>
18 #include <utility>
19 #include <vector>
20 
21 #include "google/protobuf/any.pb.h"
22 #include "absl/algorithm/container.h"
23 #include "absl/container/flat_hash_map.h"
24 #include "tensorflow/core/lib/gtl/map_util.h"
25 #include "tensorflow/core/platform/logging.h"
26 #include "tensorflow/core/platform/types.h"
27 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
28 #include "tensorflow/core/profiler/utils/event_span.h"
29 #include "tensorflow/core/profiler/utils/timespan.h"
30 
31 namespace tensorflow {
32 namespace profiler {
33 
// Local core id under which ConvertStepEventsToStepDb() stores the
// StepInfoResult of every step. Local core ids should start from 1.
const uint32 kDefaultGpuLocalCoreId = 1;
36 
37 namespace {
38 
39 // Converts from StepDetails to StepInfoResult.
ConvertStepDetailsToStepInfo(bool has_device,int64_t step_num,const StepDetails & step_details)40 StepInfoResult ConvertStepDetailsToStepInfo(bool has_device, int64_t step_num,
41                                             const StepDetails& step_details) {
42   GenericStepBreakdown generic;
43   Timespan step_time = step_details.StepTime();
44   auto& type_ps = *(generic.mutable_type_ps());
45   uint64 total_event_duration = 0;
46   for (const auto& event : step_details.Events()) {
47     // Ignore event duration outside the step marker.
48     uint64 event_duration = step_time.OverlappedDurationPs(event.span);
49     type_ps[event.type] += event_duration;
50     total_event_duration += event_duration;
51   }
52   if (total_event_duration < step_time.duration_ps()) {
53     // Some time in the step is not associated with any event. Classify them as
54     // "unknown time".
55     type_ps[UNKNOWN_TIME] += step_time.duration_ps() - total_event_duration;
56   }
57   // Determines if this particular step is a well-formed one.
58   bool well_formed_step = has_device ? (type_ps.contains(DEVICE_COMPUTE_16) ||
59                                         type_ps.contains(DEVICE_COMPUTE_32))
60                                      : type_ps.contains(HOST_COMPUTE);
61   StepInfoResult step_info;
62   step_info.mutable_step_breakdown()->PackFrom(generic);
63   if (well_formed_step) {
64     step_info.set_step_num(step_num);
65     step_info.set_step_name(step_details.StepName());
66     step_info.set_begin_ps(step_time.begin_ps());
67     step_info.set_duration_ps(step_time.duration_ps());
68   } else {
69     // For a non-well-formed step, sets its duration to 0 so that it will be
70     // ignored by the caller of this function.
71     step_info.set_duration_ps(0);
72   }
73   return step_info;
74 }
75 
DebugGenericStepBreakdown(const GenericStepBreakdown & generic)76 string DebugGenericStepBreakdown(const GenericStepBreakdown& generic) {
77   std::ostringstream out;
78   uint64 total_ps = 0;
79   const auto& type_ps_map = generic.type_ps();
80   for (const auto& type_ps : type_ps_map) {
81     total_ps += type_ps.second;
82   }
83   out << "Total ps = " << total_ps << std::endl;
84   for (int type = LAST_EVENT_TYPE; type >= 0; --type) {
85     const auto* ps = gtl::FindOrNull(type_ps_map, type);
86     if (ps == nullptr) continue;
87     double percent = (*ps * 100.0) / total_ps;
88     auto event_type = static_cast<EventType>(type);
89     out << PrintEventType(event_type) << ": " << percent << "%"
90         << ", ps = " << *ps << std::endl;
91   }
92   return out.str();
93 }
94 
DebugStepInfo(const StepInfoResult & step_info)95 string DebugStepInfo(const StepInfoResult& step_info) {
96   std::ostringstream out;
97   out << "step_num=" << step_info.step_num()
98       << ", duration_ps=" << step_info.duration_ps()
99       << ", begin_ps=" << step_info.begin_ps() << std::endl;
100   GenericStepBreakdown generic;
101   if (step_info.step_breakdown().UnpackTo(&generic)) {
102     out << "Generic step breakdown:" << std::endl;
103     out << DebugGenericStepBreakdown(generic) << std::endl;
104   } else {
105     out << step_info.step_breakdown().DebugString() << std::endl;
106   }
107   return out.str();
108 }
109 
110 }  // namespace
111 
ConvertStepEventsToStepDb(bool has_device,bool maybe_drop_incomplete_steps,const StepEvents & nonoverlapped_step_events)112 StepDatabaseResult ConvertStepEventsToStepDb(
113     bool has_device, bool maybe_drop_incomplete_steps,
114     const StepEvents& nonoverlapped_step_events) {
115   StepDatabaseResult step_db;
116   // Gets sorted step numbers.
117   std::vector<int64_t> step_numbers;
118   step_numbers.reserve(nonoverlapped_step_events.size());
119   for (const auto& step_events : nonoverlapped_step_events) {
120     step_numbers.push_back(step_events.first);
121   }
122   absl::c_sort(step_numbers);
123   for (const auto& step : step_numbers) {
124     const auto* step_details = gtl::FindOrNull(nonoverlapped_step_events, step);
125     if (step_details == nullptr) continue;
126     StepInfoResult step_info =
127         ConvertStepDetailsToStepInfo(has_device, step, *step_details);
128     if (step_info.duration_ps() == 0)
129       continue;  // Do not include non-well-formed steps.
130     PerCoreStepInfo per_core_step_info;
131     per_core_step_info.set_step_num(step);
132     // When we generated StepEvents, we already put events from all device
133     // cores and cpu threads on this host into a single event stream, therefore
134     // we can't separate them anymore. Simply assigns all events to Core-0.
135     (*per_core_step_info.mutable_step_info_per_core())[kDefaultGpuLocalCoreId] =
136         std::move(step_info);
137     VLOG(2) << std::endl
138             << "step_id: " << step << ", step_info:" << std::endl
139             << DebugStepInfo((
140                    *per_core_step_info
141                         .mutable_step_info_per_core())[kDefaultGpuLocalCoreId]);
142     // Populates the collective ops information.
143     auto& collectives = *per_core_step_info.mutable_all_reduce_db_per_core();
144     for (const auto& it : step_details->Collectives()) {
145       collectives[it.first] = it.second;
146     }
147     // Populates the device transfer stats for this step.
148     auto& device_memory_transfers =
149         *per_core_step_info.mutable_device_memory_transfers();
150     for (const auto& dma : step_details->DeviceMemoryTransfers()) {
151       *device_memory_transfers.Add() = dma;
152     }
153     // The remaining fields in PerCoreStepInfo are not filled.
154     *step_db.add_step_sequence() = per_core_step_info;
155   }
156 
157   // If we are using sampling mode and we get enough steps, we would like to
158   // drop the incomplete steps at the beginning and the end.
159   // (Sometimes CUTPI instrumentation will prolong the first step too).
160   int kDropIncomplteteStepThreshold = 5;
161   if (maybe_drop_incomplete_steps &&
162       step_db.step_sequence_size() > kDropIncomplteteStepThreshold) {
163     step_db.mutable_step_sequence()->erase(
164         step_db.mutable_step_sequence()->begin());
165     step_db.mutable_step_sequence()->RemoveLast();
166   }
167   return step_db;
168 }
169 
170 }  // namespace profiler
171 }  // namespace tensorflow
172