/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
17 #include "tensorflow/compiler/xla/service/hlo_parser.h"
18 #include "tensorflow/compiler/xla/tests/filecheck.h"
19
20 namespace xla {
21 namespace gpu {
22
23 namespace {
24
25 class ParallelReductionTest : public GpuCodegenTest {
GetDebugOptionsForTest()26 DebugOptions GetDebugOptionsForTest() override {
27 DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
28 // The test contains a MOF fusion and the XLA optimizer passes
29 // don't like this.
30 debug_options.set_xla_disable_all_hlo_passes(true);
31 return debug_options;
32 }
33 };
34
// Two independent reduces in one kInput fusion must be emitted as two
// parallel reduce groups (and no more).
TEST_F(ParallelReductionTest, TwoParallelReductions) {
  const char* hlo_text = R"(
HloModule TwoParallelReductions

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[1024] parameter(0)
  %param1 = f32[1024] parameter(1)
  %constant0 = f32[] constant(0)
  %reduce1 = f32[] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce2 = f32[] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  ROOT %tuple = (f32[], f32[]) tuple(%reduce1, %reduce2)
}

ENTRY %cluster {
  %param0 = f32[1024] parameter(0)
  %param1 = f32[1024] parameter(1)
  ROOT %fusion = (f32[], f32[])
      fusion(%param0, %param1), kind=kInput, calls=%fused_computation
}
)";

  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  // Expect exactly two reduce groups in the unoptimized IR.
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK-NOT: reduce-group-2
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}
73
// Same as TwoParallelReductions, but the shared broadcast feeding both
// reduces is additionally a fusion output; grouping must still yield
// exactly two reduce groups.
TEST_F(ParallelReductionTest, TwoParallelReductionsWithBroadcastOutput) {
  const char* hlo_text = R"(
HloModule TwoParallelReductions

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[] parameter(0)
  %param1 = f32[] parameter(1)
  %param2 = f32[] parameter(2)
  %bcast0 = f32[1024] broadcast(f32[] %param0)
  %reduce1 = f32[] reduce(%bcast0, %param1), dimensions={0}, to_apply=%add_f32
  %reduce2 = f32[] reduce(%bcast0, %param2), dimensions={0}, to_apply=%add_f32
  ROOT %tuple = (f32[], f32[], f32[1024]) tuple(%reduce1, %reduce2, %bcast0)
}

ENTRY %cluster {
  %param0 = f32[] parameter(0)
  %param1 = f32[] parameter(1)
  %param2 = f32[] parameter(2)
  ROOT %fusion = (f32[], f32[], f32[1024])
      fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation
}
)";

  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  // Expect exactly two reduce groups in the unoptimized IR.
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK-NOT: reduce-group-2
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}
114
// Builds a fusion with many independent reduces programmatically and checks
// that execution matches the reference backend.
TEST_F(ParallelReductionTest, ManyParallelReductions) {
  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
  // Simply use a number not too large to avoid long compilation time
  // and not too small for meaningful test.
  const size_t num_reduces = 32;

  // Scalar `add` computation shared by every reduce.
  HloComputation* reduce_computation;
  {
    auto embedded_builder = HloComputation::Builder("add");
    HloInstruction* lhs =
        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
            0, ShapeUtil::MakeShape(F32, {}), "lhs"));
    HloInstruction* rhs =
        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
            1, ShapeUtil::MakeShape(F32, {}), "rhs"));
    embedded_builder.AddInstruction(
        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
    reduce_computation =
        module->AddEmbeddedComputation(embedded_builder.Build());
  }

  Shape input_shape = ShapeUtil::MakeShape(F32, {1024});
  Shape output_shape = ShapeUtil::MakeShape(F32, {});
  // Fusion computation: num_reduces independent reduces rooted in a tuple.
  HloComputation* fusion_computation;
  {
    auto fusion_builder = HloComputation::Builder("fusion_computation");
    std::vector<HloInstruction*> outputs;
    HloInstruction* constant = fusion_builder.AddInstruction(
        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
    for (size_t i = 0; i < num_reduces; ++i) {
      HloInstruction* param = fusion_builder.AddInstruction(
          HloInstruction::CreateParameter(i, input_shape, "param"));
      HloInstruction* output =
          fusion_builder.AddInstruction(HloInstruction::CreateReduce(
              output_shape, param, constant, {0}, reduce_computation));
      outputs.push_back(output);
    }
    fusion_builder.AddInstruction(HloInstruction::CreateTuple(outputs));
    fusion_computation = module->AddEmbeddedComputation(fusion_builder.Build());
  }

  // Entry computation: one parameter per reduce, all fed to a kInput fusion.
  HloComputation::Builder b(TestName());
  std::vector<HloInstruction*> entry_params;
  std::vector<Shape> output_shapes;
  entry_params.reserve(num_reduces);
  output_shapes.reserve(num_reduces);
  for (size_t i = 0; i < num_reduces; ++i) {
    HloInstruction* param = b.AddInstruction(
        HloInstruction::CreateParameter(i, input_shape, "param"));
    entry_params.push_back(param);
    output_shapes.push_back(output_shape);
  }
  b.AddInstruction(HloInstruction::CreateFusion(
      ShapeUtil::MakeTupleShape(output_shapes),
      HloInstruction::FusionKind::kInput, entry_params, fusion_computation));
  module->AddEntryComputation(b.Build());

  // No IR pattern check here; only numerical correctness is verified.
  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5}));
}
174
// Reductions (and their side outputs) that share inputs must be grouped
// together; unrelated ones must land in separate groups. This fusion is
// expected to split into exactly three groups.
TEST_F(ParallelReductionTest, ThreeReductionGroups) {
  const char* hlo_text = R"(
HloModule ThreeReductionGroups

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[1024,128] parameter(0)
  %param1 = f32[1024,128] parameter(1)
  %param2 = f32[1024,128] parameter(2)
  %constant0 = f32[] constant(0)
  // %mul0, %reduce0, and %reduce1 should go into a group.
  %broadcast0 = f32[1024,128] broadcast(%constant0), dimensions={}
  %mul0 = f32[1024,128] multiply(param0, broadcast0)
  %reduce0 = f32[128] reduce(%mul0, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce1 = f32[128] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32
  // %reduce2 and %reduce3 should go into another group.
  %reduce2 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce3 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  // %reduce4 and %mul2 should go into the other group, although broadcast0 is
  // reused.
  %mul1 = f32[1024,128] multiply(param2, broadcast0)
  %reduce4 = f32[128] reduce(%mul1, %constant0), dimensions={0}, to_apply=%add_f32
  %mul2 = f32[1024,128] multiply(param2, param2)
  ROOT %tuple =
      (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128])
      tuple(%mul2, %reduce0, %reduce4, %reduce3, %reduce2, %reduce1, %mul0)
}

ENTRY %cluster {
  %param0 = f32[1024,128] parameter(0)
  %param1 = f32[1024,128] parameter(1)
  %param2 = f32[1024,128] parameter(2)
  ROOT %fusion =
      (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128])
      fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation
}
)";

  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  // Expect exactly three reduce groups in the unoptimized IR.
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK: reduce-group-2
CHECK-NOT: reduce-group-3
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}
230
}  // namespace
}  // namespace gpu
}  // namespace xla