• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
17 
18 #include <memory>
19 #include <stack>
20 #include <unordered_set>
21 #include <vector>
22 
23 #include "absl/memory/memory.h"
24 #include "tensorflow/compiler/xla/service/hlo_computation.h"
25 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
26 #include "tensorflow/compiler/xla/service/stream_pool.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
29 #include "tensorflow/core/util/ptr_util.h"
30 
31 namespace xla {
32 namespace gpu {
33 namespace {
InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>> * timers,se::Stream * stream)34 void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
35                        se::Stream* stream) {
36   timers->push(absl::make_unique<se::Timer>(stream->parent()));
37   stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
38 }
39 
GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>> * timers,const std::vector<StreamPool::Ptr> & sub_streams,se::Stream * stream,double clock_rate_ghz)40 uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,
41                       const std::vector<StreamPool::Ptr>& sub_streams,
42                       se::Stream* stream, double clock_rate_ghz) {
43   CHECK_GT(timers->size(), 0);
44   stream->ThenWaitFor(&sub_streams);
45   stream->ThenStopTimer(timers->top().get());
46   stream->BlockHostUntilDone().IgnoreError();
47   double nanoseconds = timers->top()->Nanoseconds();
48   timers->pop();
49   return static_cast<uint64>(nanoseconds * clock_rate_ghz);
50 }
51 }  // namespace
52 
HloExecutionProfiler(bool do_profile,HloExecutionProfile * profile,se::Stream * stream,const std::vector<StreamPool::Ptr> & sub_streams,size_t index)53 HloExecutionProfiler::HloExecutionProfiler(
54     bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
55     const std::vector<StreamPool::Ptr>& sub_streams, size_t index)
56     : do_profile_(do_profile),
57       profile_(profile),
58       stream_(stream),
59       sub_streams_(sub_streams),
60       computation_profile_index_(index) {
61   if (do_profile_) {
62     clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz();
63     InitAndStartTimer(&timers_, stream);
64   }
65 }
66 
FinishExecution()67 void HloExecutionProfiler::FinishExecution() {
68   CHECK(!finished_execution_) << "Call FinishExecution only once!";
69   finished_execution_ = true;
70   if (do_profile_) {
71     profile_->SetCyclesTakenBy(
72         computation_profile_index_,
73         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
74   }
75 }
76 
StartHloComputation()77 void HloExecutionProfiler::StartHloComputation() {
78   if (do_profile_) {
79     InitAndStartTimer(&timers_, stream_);
80   }
81 }
82 
FinishHloComputation(const HloComputation * computation)83 void HloExecutionProfiler::FinishHloComputation(
84     const HloComputation* computation) {
85   if (do_profile_) {
86     profile_->set_total_cycles_executed(
87         *computation,
88         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
89   }
90 }
91 
FinishHloComputation(absl::optional<size_t> profile_index)92 void HloExecutionProfiler::FinishHloComputation(
93     absl::optional<size_t> profile_index) {
94   if (do_profile_) {
95     profile_->SetCyclesTakenBy(
96         *profile_index,
97         GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
98   }
99 }
100 
StartHloInstruction()101 void HloExecutionProfiler::StartHloInstruction() {
102   if (do_profile_) {
103     InitAndStartTimer(&timers_, stream_);
104   }
105 }
106 
FinishHloInstruction(size_t index)107 void HloExecutionProfiler::FinishHloInstruction(size_t index) {
108   if (do_profile_) {
109     indices_.erase(index);
110     profile_->SetCyclesTakenBy(index, GetCyclesTaken(&timers_, sub_streams_,
111                                                      stream_, clock_rate_ghz_));
112   }
113 }
114 
115 std::unique_ptr<ScopedInstructionProfiler>
MakeScopedInstructionProfiler(absl::optional<int64> index)116 HloExecutionProfiler::MakeScopedInstructionProfiler(
117     absl::optional<int64> index) {
118   if (do_profile_ && index.has_value()) {
119     // Make sure that we are not already measuring the time for the same
120     // instruction.
121     // TODO(timshen): provide more useful printout.
122     CHECK(indices_.insert(*index).second) << *index;
123   }
124   return absl::make_unique<ScopedInstructionProfiler>(this, index);
125 }
126 
127 }  // namespace gpu
128 }  // namespace xla
129