1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"
17
18 #include <string>
19 #include <vector>
20
21 #include "absl/container/flat_hash_map.h"
22 #include "absl/container/flat_hash_set.h"
23 #include "tensorflow/core/platform/env.h"
24 #include "tensorflow/core/platform/types.h"
25 #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
26 #include "tensorflow/core/profiler/convert/op_stats_combiner.h"
27 #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
28 #include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
29 #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
30 #include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
31 #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
32 #include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
33 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
34 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
35 #include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
36 #include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
37 #include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
38 #include "tensorflow/core/profiler/protobuf/xplane.pb.h"
39 #include "tensorflow/core/profiler/utils/event_span.h"
40 #include "tensorflow/core/profiler/utils/hardware_type_utils.h"
41 #include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
42 #include "tensorflow/core/profiler/utils/step_intersection.h"
43 #include "tensorflow/core/profiler/utils/tf_op_utils.h"
44 #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
45 #include "tensorflow/core/profiler/utils/xplane_schema.h"
46 #include "tensorflow/core/profiler/utils/xplane_utils.h"
47 #include "tensorflow/core/profiler/utils/xplane_visitor.h"
48
49 namespace tensorflow {
50 namespace profiler {
51
GetDeviceCapFromXPlane(const XPlane & device_plane)52 DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) {
53 DeviceCapabilities cap;
54 XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_plane);
55 plane.ForEachStat([&cap](const XStatVisitor& stat) {
56 if (!stat.Type().has_value()) return;
57 switch (stat.Type().value()) {
58 case kDevCapClockRateKHz:
59 cap.set_clock_rate_in_ghz(stat.IntValue() / 1000000.0);
60 break;
61 case kDevCapCoreCount:
62 cap.set_num_cores(stat.IntValue());
63 break;
64 case kDevCapMemoryBandwidth:
65 cap.set_memory_bandwidth(stat.UintValue()); // bytes/s
66 break;
67 case kDevCapMemorySize:
68 cap.set_memory_size_in_bytes(stat.UintValue());
69 break;
70 case kDevCapComputeCapMajor:
71 cap.mutable_compute_capability()->set_major(stat.IntValue());
72 break;
73 case kDevCapComputeCapMinor:
74 cap.mutable_compute_capability()->set_minor(stat.IntValue());
75 break;
76 }
77 });
78 return cap;
79 }
80
MakePerfEnv(double peak_tera_flops_per_second,double peak_hbm_bw_giga_bytes_per_second)81 PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
82 double peak_hbm_bw_giga_bytes_per_second) {
83 PerfEnv result;
84 result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
85 result.set_peak_hbm_bw_giga_bytes_per_second(
86 peak_hbm_bw_giga_bytes_per_second);
87 result.set_ridge_point(peak_tera_flops_per_second * 1000 /
88 peak_hbm_bw_giga_bytes_per_second);
89 return result;
90 }
91
GetPerfEnvFromXPlane(const XPlane & device_plane)92 PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
93 DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane);
94 return MakePerfEnv(GetFlopMaxThroughputPerSM(cap) / 1000 * cap.num_cores(),
95 cap.memory_bandwidth() / 1e9);
96 }
97
98 namespace {
99
SetRunEnvironment(const XSpace & space,int32_t accelerator_count,RunEnvironment * env)100 void SetRunEnvironment(const XSpace& space, int32_t accelerator_count,
101 RunEnvironment* env) {
102 // Currently, we only support profiling one host and one program.
103 env->set_host_count(1);
104 env->set_task_count(1);
105 for (const auto& hostname : space.hostnames()) {
106 std::vector<std::string> hostname_split = absl::StrSplit(hostname, ':');
107 (*env->mutable_hostnames())[hostname_split[0]] = true;
108 }
109 env->set_device_type(accelerator_count > 0 ? "GPU" : "CPU");
110 env->set_device_core_count(accelerator_count);
111 }
112
113 } // namespace
114
PropagateXSpaceDiagnosticsToOpStats(const XSpace & space,OpStats * op_stats)115 void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
116 OpStats* op_stats) {
117 if (!space.errors().empty()) {
118 absl::flat_hash_set<std::string> unique_errors;
119 unique_errors.insert(space.errors().begin(), space.errors().end());
120 *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
121 unique_errors.end()};
122 }
123 if (!space.warnings().empty()) {
124 absl::flat_hash_set<std::string> unique_warnings;
125 unique_warnings.insert(space.warnings().begin(), space.warnings().end());
126 *op_stats->mutable_diagnostics()->mutable_warnings() = {
127 unique_warnings.begin(), unique_warnings.end()};
128 }
129 }
130
ConvertXSpaceToOpStats(const XSpace & space,const OpStatsOptions & options)131 OpStats ConvertXSpaceToOpStats(const XSpace& space,
132 const OpStatsOptions& options) {
133 const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
134 std::vector<const XPlane*> device_planes =
135 FindPlanesWithPrefix(space, kGpuPlanePrefix);
136 OpStats op_stats;
137 StepEvents step_events;
138 PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
139 // Convert device planes.
140 OpMetricsDbCombiner op_metrics_db_combiner(
141 op_stats.mutable_device_op_metrics_db());
142 SetRunEnvironment(space, device_planes.size(),
143 op_stats.mutable_run_environment());
144
145 KernelReportMap reports;
146 absl::string_view gpu_model = "";
147
148 // TODO(b/161942993) parallelize XPlane processing per thread.
149 for (const XPlane* device_trace : device_planes) {
150 if (options.generate_op_metrics_db) {
151 if (!op_stats.has_perf_env()) {
152 *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
153 }
154 OpMetricsDb device_op_metrics_db =
155 ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
156 op_metrics_db_combiner.Combine(device_op_metrics_db);
157 }
158 if (gpu_model.empty()) {
159 gpu_model = GpuModelName(GetDeviceCapFromXPlane(*device_trace));
160 }
161 if (options.generate_step_db) {
162 StepEvents device_step_events =
163 ConvertDeviceTraceXPlaneToStepEvents(*device_trace);
164 CombineStepEvents(device_step_events, &step_events);
165 }
166 if (options.generate_kernel_stats_db) {
167 ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
168 /*on_kernel_fn=*/{}, &reports);
169 }
170 }
171
172 if (!gpu_model.empty()) {
173 // Overwrites the device type with the more specific GPU model name.
174 op_stats.mutable_run_environment()->set_device_type(std::string(gpu_model));
175 }
176
177 // Combine into reports.
178 if (options.generate_kernel_stats_db) {
179 CopyTopKDurationKernelReportsToDb(reports,
180 op_stats.mutable_kernel_stats_db());
181 }
182
183 bool has_device = !device_planes.empty();
184 // Convert a host plane.
185 if (host_plane) {
186 if (options.generate_op_metrics_db) {
187 *op_stats.mutable_host_op_metrics_db() =
188 ConvertHostThreadsXPlaneToOpMetricsDb(*host_plane);
189 }
190 if (options.generate_step_db) {
191 const StepEvents* device_step_events =
192 has_device ? &step_events : nullptr;
193 StepEvents host_step_events =
194 ConvertHostThreadsXPlaneToStepEvents(*host_plane, device_step_events);
195 CombineStepEvents(host_step_events, &step_events);
196 }
197 }
198 if (options.generate_step_db) {
199 StepEvents nonoverlapped_step_events =
200 ToNonOverlappedStepEvents(step_events);
201 *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
202 has_device, options.maybe_drop_incomplete_steps,
203 nonoverlapped_step_events);
204 *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
205 ComputePrecisionStats(nonoverlapped_step_events);
206 }
207
208 CoreDetails& details =
209 (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
210 details.set_hostname(space.hostnames().empty() ? "localhost"
211 : space.hostnames(0));
212 return op_stats;
213 }
214
ConvertMultiXSpacesToCombinedOpStats(const std::vector<XSpace> & xspaces,const OpStatsOptions & options,OpStats * combined_op_stats)215 Status ConvertMultiXSpacesToCombinedOpStats(const std::vector<XSpace>& xspaces,
216 const OpStatsOptions& options,
217 OpStats* combined_op_stats) {
218 // A shortcut code path for a single XSpace. There is no need to merge OpStats
219 // if there is only a single XSpace.
220 if (xspaces.size() == 1) {
221 *combined_op_stats = ConvertXSpaceToOpStats(xspaces[0], options);
222 return Status::OK();
223 }
224
225 // Read multiple XSpaces and convert to multiple OpStats.
226 std::vector<OpStats> all_op_stats;
227 all_op_stats.reserve(xspaces.size());
228 for (const XSpace& xspace : xspaces) {
229 all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
230 }
231
232 // Combine OpStats.
233 std::vector<OpStatsInfo> all_op_stats_info;
234 all_op_stats_info.reserve(all_op_stats.size());
235 for (int i = 0; i < all_op_stats.size(); i++) {
236 all_op_stats_info.emplace_back(
237 &all_op_stats[i],
238 ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
239 }
240
241 // Do not limit the maximum number of steps during the merge of OpStats.
242 StepIntersection step_intersection =
243 ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
244 CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);
245
246 return Status::OK();
247 }
248
249 } // namespace profiler
250 } // namespace tensorflow
251