1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"

#include <deque>
#include <memory>
#include <set>
#include <unordered_map>
#include <vector>

#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/service/buffer_value.h"
#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
#include "tensorflow/compiler/xla/service/hlo_reachability.h"
#include "tensorflow/compiler/xla/service/hlo_schedule.h"
#include "tensorflow/compiler/xla/types.h"
28
29 namespace xla {
30 namespace gpu {
31
32 namespace {
33
34 // An HLO partial ordering based on the actual stream assignment and thunk
35 // launch order.
36 class GpuHloOrdering : public PredecessorHloOrdering {
37 public:
38 GpuHloOrdering(const HloModule* module,
39 const StreamAssignment& stream_assignment,
40 const std::vector<HloInstruction*>& thunk_launch_order);
41 ~GpuHloOrdering() override = default;
42
43 // Only the entry computation can possibly be sequentially ordered, and only
44 // if we've assigned all instructions to a single stream.
SequentialOrder(const HloComputation & computation) const45 const HloInstructionSequence* SequentialOrder(
46 const HloComputation& computation) const override {
47 return &computation == module_->entry_computation() ? entry_sequence_.get()
48 : nullptr;
49 }
50
ToString() const51 string ToString() const override { return ToStringHelper("GpuHloOrdering"); }
52
53 private:
54 std::unique_ptr<HloInstructionSequence> entry_sequence_;
55 };
56
GpuHloOrdering(const HloModule * module,const StreamAssignment & stream_assignment,const std::vector<HloInstruction * > & thunk_launch_order)57 GpuHloOrdering::GpuHloOrdering(
58 const HloModule* module, const StreamAssignment& stream_assignment,
59 const std::vector<HloInstruction*>& thunk_launch_order)
60 : PredecessorHloOrdering(module) {
61 // The entry computation has a total order when there's only one stream.
62 if (stream_assignment.StreamCount() == 1) {
63 entry_sequence_ =
64 absl::make_unique<HloInstructionSequence>(thunk_launch_order);
65 }
66
67 // The ordering of instructions for the entry computation is determined by the
68 // total order of thunk launches, and stream assignment. Instructions are
69 // sequential within a stream and concurrent across streams. In addition, the
70 // GpuExecutable adds cross-stream dependency edges to ensure each instruction
71 // waits for its operands before executing.
72 //
73 // The predecessor map is built incrementally, in thunk launch order. We
74 // record the most-recently seen instructions per stream in
75 // 'last_instruction_per_stream'. This lets us quickly determine the
76 // same-stream predecessors of each instruction.
77
78 // Compute the set of all instructions we will want to set reachability on.
79 auto predecessor_map = absl::make_unique<HloReachabilityMap>(
80 module->entry_computation()->MakeInstructionPostOrder());
81
82 // The most recently visited instruction per stream.
83 std::vector<const HloInstruction*> last_instruction_per_stream(
84 stream_assignment.StreamCount(), nullptr);
85
86 for (const HloInstruction* hlo : thunk_launch_order) {
87 predecessor_map->SetReachable(hlo, hlo);
88 if (stream_assignment.HasStreamAssigned(*hlo)) {
89 // Gather all instruction which are immediate predecessors of 'hlo' in the
90 // reachability graph.
91 std::vector<const HloInstruction*> immediate_preds;
92 immediate_preds.insert(immediate_preds.end(), hlo->operands().begin(),
93 hlo->operands().end());
94 immediate_preds.insert(immediate_preds.end(),
95 hlo->control_predecessors().begin(),
96 hlo->control_predecessors().end());
97
98 // All ops already queued on the same instruction stream, and their
99 // transitive predecessors, are predecessors.
100 const int stream_no = stream_assignment.StreamNumberForHlo(*hlo);
101 if (last_instruction_per_stream[stream_no] != nullptr) {
102 immediate_preds.push_back(last_instruction_per_stream[stream_no]);
103 }
104 predecessor_map->FastSetReachabilityToUnion(immediate_preds, hlo);
105 last_instruction_per_stream[stream_no] = hlo;
106 } else {
107 // Only parameters and constants don't have an assigned stream, since they
108 // don't require a thunk. These ops don't have any predecessors.
109 CHECK(hlo->opcode() == HloOpcode::kParameter ||
110 hlo->opcode() == HloOpcode::kConstant);
111 CHECK_EQ(hlo->operand_count(), 0);
112 }
113 }
114 predecessors_.emplace(module->entry_computation(),
115 std::move(predecessor_map));
116
117 // The ordering of instructions in subcomputations is based solely on control
118 // and data dependencies.
119 //
120 // TODO(toddw): Each subcomputation is actually emitted as a function in DFS
121 // postorder, so we can do better and establish the total order here. We don't
122 // do that yet since it's hard to ensure that the order here is the order used
123 // by IrEmitterNested. And mismatched ordering bugs would be hard to find.
124 for (auto* computation : module->computations()) {
125 if (computation != module->entry_computation() &&
126 !computation->IsFusionComputation()) {
127 predecessors_.emplace(computation,
128 HloReachabilityMap::Build(computation));
129 }
130 }
131 }
132
133 // Computes a topological launch_order that is close to a breadth-first
134 // order. This heuristic works well for graphs where concurrent kernels are
135 // located at the same layer. It can often reduce dependency between concurrent
136 // GEMMs due to intra-stream total orders. E.g. consider the following HLO
137 // graph where the numbers in the parens indicate the stream assigned to each
138 // HLO.
139 //
140 // A(0) -> D(0) -> E(1)
141 // |
142 // v
143 // B(0)
144 // |
145 // v
146 // C(0)
147 //
148 // If the total order is A,B,C,D,E, then C and E would be sequentialized
149 // because C completes before D starts in stream 0, and E depends on D.
150 // However, if the total order is A,B,D,C,E, then C and E can run
151 // concurrently.
BFSLaunchOrder(const HloComputation * computation,std::vector<HloInstruction * > * launch_order)152 void BFSLaunchOrder(const HloComputation* computation,
153 std::vector<HloInstruction*>* launch_order) {
154 // This topological sort uses two data structures:
155 // 1. `incoming_edge_count` which keeps track of the number of incoming
156 // edges to each HLO;
157 // 2. `queue` which contains all HLOs with no incoming edges.
158 //
159 // The sorting algorithm repeatedly pops the top from the queue and deletes
160 // that HLO from the graph, making more HLOs incoming-edge free.
161 std::deque<HloInstruction*> queue;
162 std::unordered_map<const HloInstruction*, int64> incoming_edge_count;
163 for (auto* hlo : computation->instructions()) {
164 if (hlo->operand_count() == 0) {
165 queue.push_back(hlo);
166 } else {
167 incoming_edge_count[hlo] =
168 std::set<HloInstruction*>(hlo->operands().begin(),
169 hlo->operands().end())
170 .size();
171 }
172 }
173
174 while (!queue.empty()) {
175 HloInstruction* x = queue.front();
176 queue.pop_front();
177 launch_order->push_back(x);
178 for (HloInstruction* y : x->users()) {
179 --incoming_edge_count[y];
180 if (incoming_edge_count[y] == 0) {
181 queue.push_back(y);
182 }
183 }
184 }
185 }
186
187 } // end namespace
188
GpuHloSchedule()189 GpuHloSchedule::GpuHloSchedule() {}
190
191 /* static */
Build(const HloModule & module,const StreamAssignment & stream_assignment,int64 pointer_size)192 StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
193 const HloModule& module, const StreamAssignment& stream_assignment,
194 int64 pointer_size) {
195 std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
196
197 // Initialize thunk_launch_order_, the total order of thunk launches.
198 HloComputation* entry_computation = module.entry_computation();
199 if (stream_assignment.StreamCount() == 1) {
200 // All kernels are launched on a single stream, so there's no loss of
201 // concurrency by optimizing for minimal memory usage.
202 TF_ASSIGN_OR_RETURN(
203 HloInstructionSequence sequence,
204 ScheduleComputation(
205 entry_computation, [pointer_size](const BufferValue& buffer) {
206 return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
207 }));
208 schedule->thunk_launch_order_ = sequence.instructions();
209 } else {
210 // BFS tends to increase concurrency, but also increases memory usage.
211 BFSLaunchOrder(entry_computation, &schedule->thunk_launch_order_);
212 }
213
214 schedule->hlo_ordering_ = absl::make_unique<GpuHloOrdering>(
215 &module, stream_assignment, schedule->thunk_launch_order_);
216
217 return std::move(schedule);
218 }
219
220 } // namespace gpu
221 } // namespace xla
222