/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"

#include <string>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
#include "tensorflow/core/profiler/convert/op_stats_combiner.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_function.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/event_span.h"
#include "tensorflow/core/profiler/utils/hardware_type_utils.h"
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h"
#include "tensorflow/core/profiler/utils/step_intersection.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"
#include "tensorflow/core/profiler/utils/xplane_visitor.h"

namespace tensorflow {
namespace profiler {

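// Reads the device capability stats (clock rate, core count, memory bandwidth
// and size, compute capability) recorded on a device XPlane into a
// DeviceCapabilities proto.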
DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) {
  DeviceCapabilities cap;
  XPlaneVisitor plane = CreateTfXPlaneVisitor(&device_plane);
  plane.ForEachStat([&cap](const XStatVisitor& stat) {
    if (!stat.Type().has_value()) return;
    switch (stat.Type().value()) {
      case kDevCapClockRateKHz:
        cap.set_clock_rate_in_ghz(stat.IntValue() / 1000000.0);
        break;
      case kDevCapCoreCount:
        cap.set_num_cores(stat.IntValue());
        break;
      case kDevCapMemoryBandwidth:
        cap.set_memory_bandwidth(stat.UintValue());  // bytes/s
        break;
      case kDevCapMemorySize:
        cap.set_memory_size_in_bytes(stat.UintValue());
        break;
      case kDevCapComputeCapMajor:
        cap.mutable_compute_capability()->set_major(stat.IntValue());
        break;
      case kDevCapComputeCapMinor:
        cap.mutable_compute_capability()->set_minor(stat.IntValue());
        break;
    }
  });
  return cap;
}

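// Builds a PerfEnv from peak compute (TFLOP/s) and peak HBM bandwidth (GB/s).
// The ridge point is the roofline-model arithmetic intensity (FLOP/byte) at
// which a workload shifts from memory-bound to compute-bound; the factor of
// 1000 converts the TFLOP/s-over-GB/s ratio into FLOP/byte.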
PerfEnv MakePerfEnv(double peak_tera_flops_per_second,
                    double peak_hbm_bw_giga_bytes_per_second) {
  PerfEnv result;
  result.set_peak_tera_flops_per_second(peak_tera_flops_per_second);
  result.set_peak_hbm_bw_giga_bytes_per_second(
      peak_hbm_bw_giga_bytes_per_second);
  result.set_ridge_point(peak_tera_flops_per_second * 1000 /
                         peak_hbm_bw_giga_bytes_per_second);
  return result;
}

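// Derives the PerfEnv for a device XPlane from its recorded capabilities.
// GetFlopMaxThroughputPerSM() is assumed to return GFLOP/s per streaming
// multiprocessor (hence the /1000 to TFLOP/s, scaled by the core count);
// memory bandwidth is converted from bytes/s to GB/s.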
PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) {
  DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane);
  return MakePerfEnv(GetFlopMaxThroughputPerSM(cap) / 1000 * cap.num_cores(),
                     cap.memory_bandwidth() / 1e9);
}

namespace {

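// Fills in the RunEnvironment: host/task counts, the hostnames seen in the
// XSpace (with any ":port" suffix stripped), and the device type and core
// count inferred from the number of accelerator planes.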
void SetRunEnvironment(const XSpace& space, int32_t accelerator_count,
                       RunEnvironment* env) {
  // Currently, we only support profiling one host and one program.
  env->set_host_count(1);
  env->set_task_count(1);
  for (const auto& hostname : space.hostnames()) {
    std::vector<std::string> hostname_split = absl::StrSplit(hostname, ':');
    (*env->mutable_hostnames())[hostname_split[0]] = true;
  }
  env->set_device_type(accelerator_count > 0 ? "GPU" : "CPU");
  env->set_device_core_count(accelerator_count);
}

}  // namespace

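// Copies the errors and warnings collected in the XSpace into the OpStats
// diagnostics, de-duplicating repeated messages.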
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
                                         OpStats* op_stats) {
  if (!space.errors().empty()) {
    absl::flat_hash_set<std::string> unique_errors;
    unique_errors.insert(space.errors().begin(), space.errors().end());
    *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
                                                          unique_errors.end()};
  }
  if (!space.warnings().empty()) {
    absl::flat_hash_set<std::string> unique_warnings;
    unique_warnings.insert(space.warnings().begin(), space.warnings().end());
    *op_stats->mutable_diagnostics()->mutable_warnings() = {
        unique_warnings.begin(), unique_warnings.end()};
  }
}

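// Converts a single XSpace into OpStats. Depending on the options, this
// populates the device and host op-metrics databases, the step database, and
// the kernel stats database, and records the run and performance environments.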
OpStats ConvertXSpaceToOpStats(const XSpace& space,
                               const OpStatsOptions& options) {
  const XPlane* host_plane = FindPlaneWithName(space, kHostThreadsPlaneName);
  std::vector<const XPlane*> device_planes =
      FindPlanesWithPrefix(space, kGpuPlanePrefix);
  OpStats op_stats;
  StepEvents step_events;
  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
  // Convert device planes.
  OpMetricsDbCombiner op_metrics_db_combiner(
      op_stats.mutable_device_op_metrics_db());
  SetRunEnvironment(space, device_planes.size(),
                    op_stats.mutable_run_environment());

  KernelReportMap reports;
  absl::string_view gpu_model = "";

  // TODO(b/161942993) parallelize XPlane processing per thread.
  for (const XPlane* device_trace : device_planes) {
    if (options.generate_op_metrics_db) {
      if (!op_stats.has_perf_env()) {
        *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace);
      }
      OpMetricsDb device_op_metrics_db =
          ConvertDeviceTraceXPlaneToOpMetricsDb(*device_trace);
      op_metrics_db_combiner.Combine(device_op_metrics_db);
    }
    if (gpu_model.empty()) {
      gpu_model = GpuModelName(GetDeviceCapFromXPlane(*device_trace));
    }
    if (options.generate_step_db) {
      StepEvents device_step_events =
          ConvertDeviceTraceXPlaneToStepEvents(*device_trace);
      CombineStepEvents(device_step_events, &step_events);
    }
    if (options.generate_kernel_stats_db) {
      ConvertDeviceTraceXPlaneToKernelReports(*device_trace,
                                              /*on_kernel_fn=*/{}, &reports);
    }
  }

  if (!gpu_model.empty()) {
    // Overwrite the generic device type with the specific GPU model name.
    op_stats.mutable_run_environment()->set_device_type(std::string(gpu_model));
  }

  // Copy the longest-duration kernel reports into the kernel stats DB.
  if (options.generate_kernel_stats_db) {
    CopyTopKDurationKernelReportsToDb(reports,
                                      op_stats.mutable_kernel_stats_db());
  }

  bool has_device = !device_planes.empty();
  // Convert the host plane.
  if (host_plane) {
    if (options.generate_op_metrics_db) {
      *op_stats.mutable_host_op_metrics_db() =
          ConvertHostThreadsXPlaneToOpMetricsDb(*host_plane);
    }
    if (options.generate_step_db) {
      const StepEvents* device_step_events =
          has_device ? &step_events : nullptr;
      StepEvents host_step_events =
          ConvertHostThreadsXPlaneToStepEvents(*host_plane, device_step_events);
      CombineStepEvents(host_step_events, &step_events);
    }
  }
  if (options.generate_step_db) {
    StepEvents nonoverlapped_step_events =
        ToNonOverlappedStepEvents(step_events);
    *op_stats.mutable_step_db() = ConvertStepEventsToStepDb(
        has_device, options.maybe_drop_incomplete_steps,
        nonoverlapped_step_events);
    *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() =
        ComputePrecisionStats(nonoverlapped_step_events);
  }

  // Record the profiled hostname under the default local core id.
  CoreDetails& details =
      (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId];
  details.set_hostname(space.hostnames().empty() ? "localhost"
                                                 : space.hostnames(0));
  return op_stats;
}

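// Converts multiple XSpaces (e.g., one per host) into a single combined
// OpStats: each XSpace is converted independently, and the resulting OpStats
// are merged over their intersecting steps.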
Status ConvertMultiXSpacesToCombinedOpStats(const std::vector<XSpace>& xspaces,
                                            const OpStatsOptions& options,
                                            OpStats* combined_op_stats) {
  // Shortcut: a single XSpace needs no merging.
  if (xspaces.size() == 1) {
    *combined_op_stats = ConvertXSpaceToOpStats(xspaces[0], options);
    return Status::OK();
  }

  // Convert each XSpace to its own OpStats.
  std::vector<OpStats> all_op_stats;
  all_op_stats.reserve(xspaces.size());
  for (const XSpace& xspace : xspaces) {
    all_op_stats.push_back(ConvertXSpaceToOpStats(xspace, options));
  }

  // Combine OpStats.
  std::vector<OpStatsInfo> all_op_stats_info;
  all_op_stats_info.reserve(all_op_stats.size());
  for (int i = 0; i < all_op_stats.size(); i++) {
    all_op_stats_info.emplace_back(
        &all_op_stats[i],
        ParseHardwareType(all_op_stats[i].run_environment().device_type()), i);
  }

  // Do not limit the maximum number of steps during the merge of OpStats.
  StepIntersection step_intersection =
      ComputeStepIntersectionToMergeOpStats(all_op_stats_info, kuint32max);
  CombineAllOpStats(all_op_stats_info, step_intersection, combined_op_stats);

  return Status::OK();
}

}  // namespace profiler
}  // namespace tensorflow