1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
17
18 #include <memory>
19 #include <stack>
20 #include <unordered_set>
21 #include <vector>
22
23 #include "absl/memory/memory.h"
24 #include "tensorflow/compiler/xla/service/hlo_computation.h"
25 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
26 #include "tensorflow/compiler/xla/service/stream_pool.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
29 #include "tensorflow/core/util/ptr_util.h"
30
31 namespace xla {
32 namespace gpu {
33 namespace {
InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>> * timers,se::Stream * stream)34 void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
35 se::Stream* stream) {
36 timers->push(absl::make_unique<se::Timer>(stream->parent()));
37 stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
38 }
39
GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>> * timers,const std::vector<StreamPool::Ptr> & sub_streams,se::Stream * stream,double clock_rate_ghz)40 uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,
41 const std::vector<StreamPool::Ptr>& sub_streams,
42 se::Stream* stream, double clock_rate_ghz) {
43 CHECK_GT(timers->size(), 0);
44 stream->ThenWaitFor(&sub_streams);
45 stream->ThenStopTimer(timers->top().get());
46 stream->BlockHostUntilDone().IgnoreError();
47 double nanoseconds = timers->top()->Nanoseconds();
48 timers->pop();
49 return static_cast<uint64>(nanoseconds * clock_rate_ghz);
50 }
51 } // namespace
52
HloExecutionProfiler(bool do_profile,HloExecutionProfile * profile,se::Stream * stream,const std::vector<StreamPool::Ptr> & sub_streams,size_t index)53 HloExecutionProfiler::HloExecutionProfiler(
54 bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
55 const std::vector<StreamPool::Ptr>& sub_streams, size_t index)
56 : do_profile_(do_profile),
57 profile_(profile),
58 stream_(stream),
59 sub_streams_(sub_streams),
60 computation_profile_index_(index) {
61 if (do_profile_) {
62 clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz();
63 InitAndStartTimer(&timers_, stream);
64 }
65 }
66
FinishExecution()67 void HloExecutionProfiler::FinishExecution() {
68 CHECK(!finished_execution_) << "Call FinishExecution only once!";
69 finished_execution_ = true;
70 if (do_profile_) {
71 profile_->SetCyclesTakenBy(
72 computation_profile_index_,
73 GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
74 }
75 }
76
StartHloComputation()77 void HloExecutionProfiler::StartHloComputation() {
78 if (do_profile_) {
79 InitAndStartTimer(&timers_, stream_);
80 }
81 }
82
FinishHloComputation(const HloComputation * computation)83 void HloExecutionProfiler::FinishHloComputation(
84 const HloComputation* computation) {
85 if (do_profile_) {
86 profile_->set_total_cycles_executed(
87 *computation,
88 GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
89 }
90 }
91
FinishHloComputation(absl::optional<size_t> profile_index)92 void HloExecutionProfiler::FinishHloComputation(
93 absl::optional<size_t> profile_index) {
94 if (do_profile_) {
95 profile_->SetCyclesTakenBy(
96 *profile_index,
97 GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_));
98 }
99 }
100
StartHloInstruction()101 void HloExecutionProfiler::StartHloInstruction() {
102 if (do_profile_) {
103 InitAndStartTimer(&timers_, stream_);
104 }
105 }
106
FinishHloInstruction(size_t index)107 void HloExecutionProfiler::FinishHloInstruction(size_t index) {
108 if (do_profile_) {
109 indices_.erase(index);
110 profile_->SetCyclesTakenBy(index, GetCyclesTaken(&timers_, sub_streams_,
111 stream_, clock_rate_ghz_));
112 }
113 }
114
115 std::unique_ptr<ScopedInstructionProfiler>
MakeScopedInstructionProfiler(absl::optional<int64> index)116 HloExecutionProfiler::MakeScopedInstructionProfiler(
117 absl::optional<int64> index) {
118 if (do_profile_ && index.has_value()) {
119 // Make sure that we are not already measuring the time for the same
120 // instruction.
121 // TODO(timshen): provide more useful printout.
122 CHECK(indices_.insert(*index).second) << *index;
123 }
124 return absl::make_unique<ScopedInstructionProfiler>(this, index);
125 }
126
127 } // namespace gpu
128 } // namespace xla
129