/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/util/stat_summarizer.h"

#include <iomanip>
#include <map>
#include <queue>
#include <sstream>
#include <string>

#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

using Detail = StatsCalculator::Detail;
StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
    : stats_calculator_(new StatsCalculator(options)) {}

StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
    : stats_calculator_(new StatsCalculator(StatSummarizerOptions())) {}

StatSummarizer::~StatSummarizer() {}

void StatSummarizer::Validate(const std::vector<TensorDescription>* outputs,
                              const NodeExecStats& ns) const {
  if (outputs->size() != ns.output_size()) {
    LOG(WARNING) << "Number of outputs changed between runs for '"
                 << ns.node_name() << "' - was " << outputs->size() << ", now "
                 << ns.output_size();
  } else {
    for (const auto& output : ns.output()) {
      const int32 slot = output.slot();
      if ((slot < 0) || (slot >= ns.output_size())) {
        // This is not a hard error for Switch ops, so just pass.
        continue;
      }
      const auto& stored = (*outputs)[slot];
      const auto& current = output.tensor_description();

      bool do_tensors_match =
          (stored.dtype() == current.dtype()) &&
          (stored.shape().dim_size() == current.shape().dim_size());

      if (do_tensors_match) {
        for (int i = 0; i < stored.shape().dim_size(); ++i) {
          if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
            do_tensors_match = false;
            break;
          }
        }
      }

      if (!do_tensors_match) {
        LOG(WARNING) << "Output tensor changed between runs for '"
                     << ns.node_name() << "'";
      }
    }
  }
}

void StatSummarizer::PrintStepStats() const {
  string output = GetOutputString();
  std::istringstream iss(output);
  for (std::string line; std::getline(iss, line);) {
    LOG(INFO) << line;
  }
}

namespace {
std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
  // There is no published specification of how DeviceStats and NodeStats
  // are filled in. Thus, we live with the fragility of this implementation.
  //
  // Note that NodeStats.node_name may NOT refer to a node in the Graph.
  // This can happen if either:
  // (1) The DeviceStats corresponds to statistics from the GPUTracer
  //     logging (which adds devices whose name contains either "/stream"
  //     or "/memcpy" to the StepStats), OR
  // (2) The graph was partitioned, and thus the NodeStats refers to
  //     the SendTensor or RecvTensor operations added.
  // For these cases, return "<>" as the "type" of the operation.
  //
  // The StatSummarizer was initially aimed at CPU execution on mobile, where
  // there was no GPUTracing and no graph partitioning, so the conditions above
  // do not occur.
  //
  // It would be nice to have a clearer spec for StepStats so utilities such
  // as this class can handle nodes that do not appear in the original graph
  // gracefully. Until then, duplicate what is done by:
  // https://www.tensorflow.org/code/tensorflow/python/client/timeline.py
  // and rely on the unittest.
  if (ds.device().find("/stream") != std::string::npos ||
      ds.device().find("/memcpy") != std::string::npos) {
    // Stats from the GPUTracer; these do not correspond to TensorFlow ops.
    return "<>";
  }
  // timeline_label should be of the format: <node_name> = <op_type>(<args>)
  // Extract <op_type>.
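  // Example (illustrative): for a timeline_label of
  //   "my_matmul = MatMul(a, b)"
  // this returns "MatMul".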
  const std::string sep(" = ");
  const std::string& label = ns.timeline_label();
  std::string::size_type start = label.find(sep);
  if (start == std::string::npos) return "<>";
  start += sep.size();
  std::string::size_type end = label.find('(', start);
  if (end == std::string::npos) return "<>";
  return label.substr(start, end - start);
}
}  // namespace

void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
  int64 curr_total_us = 0;
  int64 mem_total = 0;

  int64 first_node_start_us =
      (step_stats.dev_stats_size() > 0 &&
       step_stats.dev_stats(0).node_stats_size() > 0)
          ? step_stats.dev_stats(0).node_stats(0).all_start_micros()
          : 0;

  int node_num = 0;
  for (const auto& ds : step_stats.dev_stats()) {
    for (const auto& ns : ds.node_stats()) {
      // NOTE(blackhc): To better support GPUs:
      // GPU kernels are duplicated both in /stream:all and their
      // /stream:$index. GPU memcpys are duplicated both in /memcpy and their
      // /stream:$index. So only keep /stream:all and /memcpy and ignore all
      // /stream:$index to only count GPU executions once.
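      // Example (illustrative): stats on "/device:GPU:0/stream:all" are kept,
      // while the duplicates on "/device:GPU:0/stream:3" are skipped below.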
      if (ds.device().find("/stream") != std::string::npos &&
          ds.device().find("/stream:all") == std::string::npos) {
        continue;
      }
      // NOTE(fishx): We will record op execution time twice: once as a CPU
      // activity with device name "/host:CPU", and once as TF runtime
      // activity with a device name starting with "/job:*". It is safe to
      // ignore CPU activities here.
      // TODO(b/138729463): Read ops execution time from CPU activities instead
      // of runtime activities.
      if (ds.device().find("/host:CPU") != std::string::npos) {
        continue;
      }

      std::string name = ns.node_name();
      std::string op_type = "<>";
      // NOTE(blackhc): we have to ensure that all keys into the detail map
      // are unique, so we add [Kernel] or [MemCpy] as a suffix to the name.
      // To make the node type summary work better, we prefix "gpu:" to
      // the op type when the info is from a /gpu/stream or /memcpy channel.
      if (ds.device().find("/stream") != std::string::npos) {
        // node_name: name ":" opType
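        // Example (illustrative): "conv1:Conv2D" becomes name
        // "conv1 [Kernel]" with op_type "gpu:Conv2D".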
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2) {
          name = parts[0] + " [Kernel]";
          op_type = "gpu:" + parts[1];
        }
      } else if (ds.device().find("/memcpy") != std::string::npos) {
        // node_name: name (":" opType)? ":" memCpyType
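        // Example (illustrative): "edge_8_conv1:MEMCPYDtoH" becomes name
        // "edge_8_conv1 [MemCpy]" with op_type "gpu:MEMCPYDtoH".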
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2 || parts.size() == 3) {
          name = parts.front() + " [MemCpy]";
          // We don't care about the actual op type (it might not be available
          // for edge_ memcpys). We only care that it's a memcpy for now.
          op_type = "gpu:" + parts.back();
        }
      } else {
        op_type = OpType(ds, ns);
      }

      ++node_num;
      const int64 curr_time = ns.all_end_rel_micros();
      curr_total_us += curr_time;
      auto output_result =
          outputs_.emplace(name, std::vector<TensorDescription>());
      std::vector<TensorDescription>* outputs = &(output_result.first->second);

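      // Record times relative to the first node in the step, so that
      // successive runs of the same graph line up in the stats.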
      int64_t start_us = (ns.all_start_micros() - first_node_start_us);
      int64_t rel_end_us = curr_time;

      // If this is the first pass, initialize some values.
      if (output_result.second) {
        outputs->resize(ns.output_size());
        for (const auto& output : ns.output()) {
          const int32 slot = output.slot();
          if ((slot < 0) || (slot >= ns.output_size())) {
            // This is not a hard error for Switch ops, so just pass.
            continue;
          }
          (*outputs)[slot] = output.tensor_description();
        }
      }

      int64 curr_node_mem = 0;
      for (const auto& mem : ns.memory()) {
        const int64 mem_usage = mem.total_bytes();
        curr_node_mem += mem_usage;
      }
      stats_calculator_->AddNodeStats(name, op_type, node_num, start_us,
                                      rel_end_us, curr_node_mem);

      mem_total += curr_node_mem;

      Validate(outputs, ns);
    }
  }

  stats_calculator_->UpdateRunTotalUs(curr_total_us);
  stats_calculator_->UpdateMemoryUsed(mem_total);
}

void StatSummarizer::PrintOutputs() const {
  std::priority_queue<
      std::pair<int64, const std::pair<const std::string, Detail>*>>
      timings;
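  // std::priority_queue is a max-heap, so the average start time is negated
  // below to pop entries in run order (earliest first).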
  for (const auto& entry : stats_calculator_->GetDetails()) {
    timings.emplace(-entry.second.start_us.avg(), &entry);
  }

  LOG(INFO) << "============ Node output tensor sizes in run order ========";
  while (!timings.empty()) {
    auto entry = timings.top();
    timings.pop();
    std::stringstream stream;
    const auto& detail_outputs = outputs_.at(entry.second->first);
    stream << entry.second->first << "\t" << detail_outputs.size();
    for (const auto& tensor : detail_outputs) {
      stream << "\t" << DataTypeString(tensor.dtype());
      stream << "\t" << tensor.shape().dim_size();
      for (const auto& d : tensor.shape().dim()) {
        stream << "\t" << d.size();
      }
    }
    LOG(INFO) << stream.str();
  }
}

}  // namespace tensorflow