1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
17
18 #include <algorithm>
19 #include <unordered_set>
20
21 #include "absl/memory/memory.h"
22 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
23 #include "tensorflow/compiler/xla/service/hlo_computation.h"
24 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
25 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
26 #include "tensorflow/compiler/xla/test_helpers.h"
27 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
28 #include "tensorflow/compiler/xla/tests/test_utils.h"
29 #include "tensorflow/compiler/xla/types.h"
30
31 namespace xla {
32 namespace gpu {
33
34 class GpuHloScheduleTest : public HloTestBase {
35 protected:
36 using HloVec = std::vector<HloInstruction*>;
37
38 // Pre-canned shapes.
39 Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
40
BuildGpuHloSchedule(const HloModule & module,const StreamAssignment & streams)41 static std::unique_ptr<GpuHloSchedule> BuildGpuHloSchedule(
42 const HloModule& module, const StreamAssignment& streams) {
43 return GpuHloSchedule::Build(module, streams, /*pointer_size=*/8)
44 .ConsumeValueOrDie();
45 }
46
CreateNewVerifiedModule()47 std::unique_ptr<HloModule> CreateNewVerifiedModule() {
48 HloModuleConfig config;
49 auto debug_options = GetDebugOptionsForTest();
50 debug_options.set_xla_gpu_disable_multi_streaming(false);
51 config.set_debug_options(debug_options);
52 return absl::make_unique<HloModule>("test_module", config);
53 }
54
RemoveHlo(const HloVec & input,const std::unordered_set<const HloInstruction * > & remove)55 HloVec RemoveHlo(const HloVec& input,
56 const std::unordered_set<const HloInstruction*>& remove) {
57 HloVec result(input);
58 result.erase(std::remove_if(result.begin(), result.end(),
59 [&remove](const HloInstruction* x) {
60 return remove.count(x) > 0;
61 }),
62 result.end());
63 return result;
64 }
65 };
66
67 // Test of a single stream, where data dependencies fully determine the
68 // execution order.
TEST_F(GpuHloScheduleTest,SequentialMatMul)69 TEST_F(GpuHloScheduleTest, SequentialMatMul) {
70 HloComputation::Builder builder("entry_computation");
71 HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
72 /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
73 HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
74 /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
75 HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
76 /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
77 HloInstruction* dot1 =
78 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, x, y));
79 HloInstruction* dot2 =
80 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
81
82 auto module = CreateNewVerifiedModule();
83 module->AddEntryComputation(builder.Build(dot2));
84
85 std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
86 EXPECT_EQ(streams->StreamNumberForHlo(*dot1),
87 streams->StreamNumberForHlo(*dot2));
88
89 auto schedule = BuildGpuHloSchedule(*module, *streams);
90 // Remove parameters, which are unordered.
91 EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
92 HloVec({dot1, dot2}));
93
94 // Parameters x,y,z are mutually unordered, while dot1 and dot2 are
95 // transitively ordered by operands.
96 auto order = schedule->ConsumeHloOrdering();
97 EXPECT_TRUE(order->ExecutesBefore(x, dot1));
98 EXPECT_TRUE(order->ExecutesBefore(x, dot2));
99 EXPECT_TRUE(order->ExecutesBefore(y, dot1));
100 EXPECT_TRUE(order->ExecutesBefore(y, dot2));
101 EXPECT_TRUE(order->ExecutesBefore(z, dot2));
102 EXPECT_TRUE(order->ExecutesBefore(dot1, dot2));
103
104 EXPECT_FALSE(order->ExecutesBefore(x, x));
105 EXPECT_FALSE(order->ExecutesBefore(x, y));
106 EXPECT_FALSE(order->ExecutesBefore(x, z));
107 EXPECT_FALSE(order->ExecutesBefore(y, x));
108 EXPECT_FALSE(order->ExecutesBefore(y, y));
109 EXPECT_FALSE(order->ExecutesBefore(y, z));
110 EXPECT_FALSE(order->ExecutesBefore(z, x));
111 EXPECT_FALSE(order->ExecutesBefore(z, y));
112 EXPECT_FALSE(order->ExecutesBefore(z, z));
113 EXPECT_FALSE(order->ExecutesBefore(z, dot1));
114 EXPECT_FALSE(order->ExecutesBefore(dot1, x));
115 EXPECT_FALSE(order->ExecutesBefore(dot1, y));
116 EXPECT_FALSE(order->ExecutesBefore(dot1, z));
117 EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
118 EXPECT_FALSE(order->ExecutesBefore(dot2, x));
119 EXPECT_FALSE(order->ExecutesBefore(dot2, y));
120 EXPECT_FALSE(order->ExecutesBefore(dot2, z));
121 EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
122 EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
123 }
124
125 // Test of a single stream, where data dependencies do not fully determine the
126 // execution order, but the stream assignment does.
TEST_F(GpuHloScheduleTest,SequentialAdd)127 TEST_F(GpuHloScheduleTest, SequentialAdd) {
128 HloComputation::Builder builder("entry_computation");
129 HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
130 /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
131 HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
132 /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
133 HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
134 /*parameter_number=*/2, f32_2x2_, /*name=*/"z"));
135 HloInstruction* add1 = builder.AddInstruction(
136 HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, x, y));
137 HloInstruction* add2 = builder.AddInstruction(
138 HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, y, z));
139 HloInstruction* add3 = builder.AddInstruction(
140 HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2));
141
142 auto module = CreateNewVerifiedModule();
143 module->AddEntryComputation(builder.Build(add3));
144
145 std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
146 EXPECT_EQ(streams->StreamNumberForHlo(*add1),
147 streams->StreamNumberForHlo(*add2));
148 EXPECT_EQ(streams->StreamNumberForHlo(*add1),
149 streams->StreamNumberForHlo(*add3));
150
151 auto schedule = BuildGpuHloSchedule(*module, *streams);
152 // Remove parameters, which are unordered.
153 EXPECT_EQ(RemoveHlo(schedule->ThunkLaunchOrder(), {x, y, z}),
154 HloVec({add1, add2, add3}));
155
156 // Parameters x,y,z are mutually unordered, while add1, add2 and add3 are
157 // transitively ordered by operands.
158 auto order = schedule->ConsumeHloOrdering();
159 EXPECT_TRUE(order->ExecutesBefore(x, add1));
160 EXPECT_TRUE(order->ExecutesBefore(x, add2));
161 EXPECT_TRUE(order->ExecutesBefore(x, add3));
162 EXPECT_TRUE(order->ExecutesBefore(y, add1));
163 EXPECT_TRUE(order->ExecutesBefore(y, add2));
164 EXPECT_TRUE(order->ExecutesBefore(y, add3));
165 EXPECT_TRUE(order->ExecutesBefore(z, add2));
166 EXPECT_TRUE(order->ExecutesBefore(z, add3));
167 EXPECT_TRUE(order->ExecutesBefore(add1, add3));
168 EXPECT_TRUE(order->ExecutesBefore(add2, add3));
169 // The HLO graph does not define an ordering for add1 and add2, but their
170 // assignment onto the same stream does define an ordering.
171 if (order->ExecutesBefore(add1, add2)) {
172 EXPECT_FALSE(order->ExecutesBefore(add2, add1));
173 } else {
174 EXPECT_TRUE(order->ExecutesBefore(add2, add1));
175 EXPECT_FALSE(order->ExecutesBefore(add1, add2));
176 }
177
178 EXPECT_FALSE(order->ExecutesBefore(x, x));
179 EXPECT_FALSE(order->ExecutesBefore(x, y));
180 EXPECT_FALSE(order->ExecutesBefore(x, z));
181 EXPECT_FALSE(order->ExecutesBefore(y, x));
182 EXPECT_FALSE(order->ExecutesBefore(y, y));
183 EXPECT_FALSE(order->ExecutesBefore(y, z));
184 EXPECT_FALSE(order->ExecutesBefore(z, x));
185 EXPECT_FALSE(order->ExecutesBefore(z, y));
186 EXPECT_FALSE(order->ExecutesBefore(z, z));
187 EXPECT_FALSE(order->ExecutesBefore(z, add1));
188 EXPECT_FALSE(order->ExecutesBefore(add1, x));
189 EXPECT_FALSE(order->ExecutesBefore(add1, y));
190 EXPECT_FALSE(order->ExecutesBefore(add1, z));
191 EXPECT_FALSE(order->ExecutesBefore(add1, add1));
192 EXPECT_FALSE(order->ExecutesBefore(add2, x));
193 EXPECT_FALSE(order->ExecutesBefore(add2, y));
194 EXPECT_FALSE(order->ExecutesBefore(add2, z));
195 EXPECT_FALSE(order->ExecutesBefore(add2, add2));
196 }
197
198 // Test of two streams.
TEST_F(GpuHloScheduleTest,ConcurrentMatMul)199 TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
200 HloComputation::Builder builder("entry_computation");
201 HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
202 /*parameter_number=*/0, f32_2x2_, /*name=*/"x"));
203 HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
204 /*parameter_number=*/1, f32_2x2_, /*name=*/"y"));
205 HloInstruction* dot1 =
206 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, x, y));
207 HloInstruction* dot2 =
208 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, y, x));
209 HloInstruction* add =
210 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2));
211
212 auto module = CreateNewVerifiedModule();
213 module->AddEntryComputation(builder.Build(add));
214
215 std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
216 EXPECT_NE(streams->StreamNumberForHlo(*dot1),
217 streams->StreamNumberForHlo(*dot2));
218
219 auto schedule = BuildGpuHloSchedule(*module, *streams);
220 // Remove parameters, which are unordered.
221 HloVec thunk_launch_order = RemoveHlo(schedule->ThunkLaunchOrder(), {x, y});
222 EXPECT_TRUE(thunk_launch_order == HloVec({dot1, dot2, add}) ||
223 thunk_launch_order == HloVec({dot2, dot1, add}));
224
225 // Parameters x,y are mutually unordered, while dot1, dot2 and add are
226 // transitively ordered by operands.
227 auto order = schedule->ConsumeHloOrdering();
228 EXPECT_TRUE(order->ExecutesBefore(x, dot1));
229 EXPECT_TRUE(order->ExecutesBefore(x, dot2));
230 EXPECT_TRUE(order->ExecutesBefore(y, dot1));
231 EXPECT_TRUE(order->ExecutesBefore(y, dot2));
232 EXPECT_TRUE(order->ExecutesBefore(dot1, add));
233 EXPECT_TRUE(order->ExecutesBefore(dot2, add));
234
235 EXPECT_FALSE(order->ExecutesBefore(x, x));
236 EXPECT_FALSE(order->ExecutesBefore(x, y));
237 EXPECT_FALSE(order->ExecutesBefore(y, x));
238 EXPECT_FALSE(order->ExecutesBefore(y, y));
239 EXPECT_FALSE(order->ExecutesBefore(dot1, x));
240 EXPECT_FALSE(order->ExecutesBefore(dot1, y));
241 EXPECT_FALSE(order->ExecutesBefore(dot1, dot1));
242 EXPECT_FALSE(order->ExecutesBefore(dot1, dot2));
243 EXPECT_FALSE(order->ExecutesBefore(dot2, x));
244 EXPECT_FALSE(order->ExecutesBefore(dot2, y));
245 EXPECT_FALSE(order->ExecutesBefore(dot2, dot1));
246 EXPECT_FALSE(order->ExecutesBefore(dot2, dot2));
247 EXPECT_FALSE(order->ExecutesBefore(add, x));
248 EXPECT_FALSE(order->ExecutesBefore(add, y));
249 EXPECT_FALSE(order->ExecutesBefore(add, dot1));
250 EXPECT_FALSE(order->ExecutesBefore(add, dot2));
251 EXPECT_FALSE(order->ExecutesBefore(add, add));
252 }
253
254 // Test of multiple streams.
TEST_F(GpuHloScheduleTest,LatticeMatMul)255 TEST_F(GpuHloScheduleTest, LatticeMatMul) {
256 // d00 -- layer 0
257 // / \
258 // d10 d11 -- layer 1
259 // / \ / \
260 // d20 d21 d22 -- layer 2
261 // \ / \ /
262 // d30 d31 -- layer 3
263 // \ /
264 // d40 -- layer 4
265 HloComputation::Builder builder("entry_computation");
266 std::vector<HloInstruction*> params;
267 params.reserve(6);
268 for (int i = 0; i < 6; ++i) {
269 params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
270 i, f32_2x2_, /*name=*/absl::StrFormat("param%d", i))));
271 }
272 HloInstruction* d00 = builder.AddInstruction(
273 CreateCanonicalDot(f32_2x2_, params[2], params[3]));
274 HloInstruction* d10 =
275 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, params[1], d00));
276 HloInstruction* d11 =
277 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d00, params[4]));
278 HloInstruction* d20 =
279 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, params[0], d10));
280 HloInstruction* d21 =
281 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d10, d11));
282 HloInstruction* d22 =
283 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d11, params[5]));
284 HloInstruction* d30 =
285 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d20, d21));
286 HloInstruction* d31 =
287 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d21, d22));
288 HloInstruction* d40 =
289 builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
290
291 auto module = CreateNewVerifiedModule();
292 module->AddEntryComputation(builder.Build(d40));
293
294 std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
295 // The two dots on layer 1 are concurrent.
296 EXPECT_NE(streams->StreamNumberForHlo(*d10),
297 streams->StreamNumberForHlo(*d11));
298 // The three dots on layer 2 are concurrent.
299 EXPECT_NE(streams->StreamNumberForHlo(*d20),
300 streams->StreamNumberForHlo(*d21));
301 EXPECT_NE(streams->StreamNumberForHlo(*d20),
302 streams->StreamNumberForHlo(*d22));
303 EXPECT_NE(streams->StreamNumberForHlo(*d21),
304 streams->StreamNumberForHlo(*d22));
305 // The two dots on layer 3 are concurrent.
306 EXPECT_NE(streams->StreamNumberForHlo(*d30),
307 streams->StreamNumberForHlo(*d31));
308
309 // We don't check the thunk launch order, since there are many valid total
310 // orders, and it's annoying to express.
311 auto schedule = BuildGpuHloSchedule(*module, *streams);
312
313 auto order = schedule->ConsumeHloOrdering();
314 const HloVec all_params(
315 {params[0], params[1], params[2], params[3], params[4], params[5]});
316 const HloVec all_ops({d00, d10, d11, d20, d21, d22, d30, d31, d40});
317
318 // Parameters are mutually unordered, and never execute before ops.
319 for (const HloInstruction* param : all_params) {
320 for (const HloInstruction* param2 : all_params) {
321 EXPECT_FALSE(order->ExecutesBefore(param, param2));
322 }
323 for (const HloInstruction* op : all_ops) {
324 EXPECT_FALSE(order->ExecutesBefore(op, param));
325 }
326 }
327
328 // Check ordering of params before ops.
329 for (const HloInstruction* op : all_ops) {
330 if (op == d20 || op == d30 || op == d40) {
331 EXPECT_TRUE(order->ExecutesBefore(params[0], op));
332 } else {
333 EXPECT_FALSE(order->ExecutesBefore(params[0], op));
334 }
335 if (op != d00 && op != d11 && op != d22) {
336 EXPECT_TRUE(order->ExecutesBefore(params[1], op));
337 } else {
338 EXPECT_FALSE(order->ExecutesBefore(params[1], op));
339 }
340 EXPECT_TRUE(order->ExecutesBefore(params[2], op));
341 EXPECT_TRUE(order->ExecutesBefore(params[3], op));
342 if (op != d00 && op != d10 && op != d20) {
343 EXPECT_TRUE(order->ExecutesBefore(params[4], op));
344 } else {
345 EXPECT_FALSE(order->ExecutesBefore(params[4], op));
346 }
347 if (op == d22 || op == d31 || op == d40) {
348 EXPECT_TRUE(order->ExecutesBefore(params[5], op));
349 } else {
350 EXPECT_FALSE(order->ExecutesBefore(params[5], op));
351 }
352 }
353
354 // Check ordering of ops before ops.
355 for (const HloInstruction* op : all_ops) {
356 if (op != d00) {
357 EXPECT_TRUE(order->ExecutesBefore(d00, op));
358 } else {
359 EXPECT_FALSE(order->ExecutesBefore(d00, op));
360 }
361
362 if (op == d20 || op == d21 || op == d30 || op == d31 || op == d40) {
363 EXPECT_TRUE(order->ExecutesBefore(d10, op));
364 } else {
365 EXPECT_FALSE(order->ExecutesBefore(d10, op));
366 }
367
368 if (op == d21 || op == d22 || op == d30 || op == d31 || op == d40) {
369 EXPECT_TRUE(order->ExecutesBefore(d11, op));
370 } else {
371 EXPECT_FALSE(order->ExecutesBefore(d11, op));
372 }
373
374 if (op == d30 || op == d40) {
375 EXPECT_TRUE(order->ExecutesBefore(d20, op));
376 } else {
377 EXPECT_FALSE(order->ExecutesBefore(d20, op));
378 }
379
380 if (op == d30 || op == d31 || op == d40) {
381 EXPECT_TRUE(order->ExecutesBefore(d21, op));
382 } else {
383 EXPECT_FALSE(order->ExecutesBefore(d21, op));
384 }
385
386 if (op == d31 || op == d40) {
387 EXPECT_TRUE(order->ExecutesBefore(d22, op));
388 } else {
389 EXPECT_FALSE(order->ExecutesBefore(d22, op));
390 }
391
392 if (op == d40) {
393 EXPECT_TRUE(order->ExecutesBefore(d30, op));
394 EXPECT_TRUE(order->ExecutesBefore(d31, op));
395 } else {
396 EXPECT_FALSE(order->ExecutesBefore(d30, op));
397 EXPECT_FALSE(order->ExecutesBefore(d31, op));
398 }
399
400 EXPECT_FALSE(order->ExecutesBefore(d40, op));
401 }
402 }
403
404 } // namespace gpu
405 } // namespace xla
406