/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
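// For illustration (hypothetical but typical RunMetadata device names; they
// are lowercased by TFGraphNode::AddStepStat before reaching these helpers):
//   "/job:w/replica:0/task:0/device:gpu:0"            -> CountAsCPUTime and
//                                                        IsCanonicalDevice
//   "/job:w/replica:0/task:0/device:gpu:0/stream:all" -> CountAsAcceleratorTime
//   "/job:w/replica:0/task:0/device:gpu:0/stream:3"   -> neither; per-stream
//                                                        kernels are covered
//                                                        by the stream:all
//                                                        entry
//   "/job:w/replica:0/task:0/device:cpu:0"            -> CountAsCPUTime and
//                                                        IsCanonicalDevice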

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros: both are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// also includes some post-processing. Besides, there is currently no way to
// measure the execution time of async ops accurately.
//
// Notes about device:
// An op placed on GPU appears under three different devices in RunMetadata:
// 1) gpu:x, 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used as a
// combined view of all the different 3). 1) is the op scheduling,
// pre-processing and post-processing time. 3) is the execution time of GPU
// kernels on a stream.
// An op placed on CPU only appears as cpu:0.

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64_t>(exec_.all_start_micros()),
                   static_cast<int64_t>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64_t op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round quick executions up to 1 micro to be semantically robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64_t>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64_t, int64_t> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}

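// Illustrative example (hypothetical numbers): if one step records two GPU
// kernels for this node on ".../device:gpu:0/stream:all", (start=100, dur=10)
// and (start=120, dur=5), plus the scheduling/pre/post-processing entry on
// ".../device:gpu:0" with (start=100, dur=30), then accelerator_execs_ gets
// the two kernel pairs and cpu_execs_ gets the 30us entry; the totals are
// summed in accelerator_exec_micros() and cpu_exec_micros() below.
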
void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    absl::FPrintF(stderr, "%s has no start time, skipping\n",
                  step_stat.node_name());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64_t>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64_t>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    absl::FPrintF(stderr, "found %d gpu allocators for 1 node\n",
                  accelerator_allocator_cnt);
  }

  int64_t total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe just use allocated_bytes.
      int64_t output_bytes = std::max(output.tensor_description()
                                          .allocation_description()
                                          .allocated_bytes(),
                                      output.tensor_description()
                                          .allocation_description()
                                          .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on gpu only allocate memory on gpu.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: memory tracking is suspicious and requires a large-scale
  // clean-up.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. record_xxx_persistent is called when allocate_persistent
  //    is not used and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It's suspicious, but we should
  //    use it for now since it covers constant ops.
  int64_t residual_bytes = 0;
  int64_t requested_bytes = 0;
  int64_t peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}
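// Worked example for AddMemoryStats above (illustrative numbers, not from
// real data): one GPU memory entry with total_bytes=4096, live_bytes=1024,
// peak_bytes=4096, plus accelerator_persistent_bytes=256 and
// accelerator_temp_bytes=512 from memory_stats, yields
//   requested_bytes = 4096 + 256 + 512 = 4864,
//   residual_bytes  = 1024 + 256       = 1280,
//   peak_bytes      = 4096 + 256 + 512 = 4864.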

void TFGraphNode::AddStepStat(int64_t step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = absl::AsciiStrToLower(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // The device can be /job:0/replica:0/xxxx/device:GPU:0, or simply
  // /device:GPU:0. It can also have an ad-hoc suffix, such as /stream:xx or
  // /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64_t, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}
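// For illustration (hypothetical device name): with a canonical device of
// "/job:w/replica:0/task:0/device:gpu:3", the StringReplace(dev, "gpu:\\d+",
// "cpu:0") call above derives the host device
// "/job:w/replica:0/task:0/device:cpu:0".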

int64_t ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64_t ExecStep::accelerator_exec_micros() const {
  int64_t total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or be scheduled multiple times
    // in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64_t ExecStep::cpu_exec_micros() const {
  int64_t total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}
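// Continuing the illustrative example under AddTimeStats: the two stream:all
// kernel entries of 10us and 5us give accelerator_exec_micros() == 15, the
// 30us gpu:x entry gives cpu_exec_micros() == 30, and exec_micros() == 45.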

std::vector<int64_t> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64_t> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64_t>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64_t s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}
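// Example behavior of the two conversions above (illustrative): a proto with
// dims {2, 3} maps to the vector {2, 3}; a scalar proto (no dims, known rank)
// maps to {1}; an empty vector maps back to a proto with unknown_rank(), so
// the scalar case does not round-trip exactly.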

bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow