/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/tests/filecheck.h"

namespace xla {
namespace gpu {

namespace {

class ParallelReductionTest : public GpuCodegenTest {
  DebugOptions GetDebugOptionsForTest() override {
    DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest();
    // The tests contain hand-written multi-output fusions (MOF) that the XLA
    // optimizer passes may rewrite; disable all HLO passes so the modules are
    // compiled exactly as written.
    debug_options.set_xla_disable_all_hlo_passes(true);
    return debug_options;
  }
};

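// A kInput fusion containing two independent reductions should be emitted as
// two parallel reduce groups ("reduce-group-0" and "reduce-group-1" in the
// IR), with no additional group.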
TEST_F(ParallelReductionTest, TwoParallelReductions) {
  const char* hlo_text = R"(
HloModule TwoParallelReductions

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[1024] parameter(0)
  %param1 = f32[1024] parameter(1)
  %constant0 = f32[] constant(0)
  %reduce1 = f32[] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce2 = f32[] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  ROOT %tuple = (f32[], f32[]) tuple(%reduce1, %reduce2)
}

ENTRY %cluster {
  %param0 = f32[1024] parameter(0)
  %param1 = f32[1024] parameter(1)
  ROOT %fusion = (f32[], f32[])
      fusion(%param0, %param1), kind=kInput, calls=%fused_computation
}
)";

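  // Verify that the unoptimized IR contains exactly two parallel reduce
  // groups and no third one.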
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK-NOT: reduce-group-2
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}

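// A broadcast that feeds both reductions and is itself a fusion output
// should not change the grouping: still exactly two parallel reduce groups.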
TEST_F(ParallelReductionTest, TwoParallelReductionsWithBroadcastOutput) {
  const char* hlo_text = R"(
HloModule TwoParallelReductions

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[] parameter(0)
  %param1 = f32[] parameter(1)
  %param2 = f32[] parameter(2)
  %bcast0 = f32[1024] broadcast(f32[] %param0), dimensions={}
  %reduce1 = f32[] reduce(%bcast0, %param1), dimensions={0}, to_apply=%add_f32
  %reduce2 = f32[] reduce(%bcast0, %param2), dimensions={0}, to_apply=%add_f32
  ROOT %tuple = (f32[], f32[], f32[1024]) tuple(%reduce1, %reduce2, %bcast0)
}

ENTRY %cluster {
  %param0 = f32[] parameter(0)
  %param1 = f32[] parameter(1)
  %param2 = f32[] parameter(2)
  ROOT %fusion = (f32[], f32[], f32[1024])
      fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation
}
)";

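  // The broadcast, though also a fusion output, must not add a third group.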
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK-NOT: reduce-group-2
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}

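// Build a fusion with many independent reductions programmatically and check
// numerical correctness only; no IR pattern is matched here.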
TEST_F(ParallelReductionTest, ManyParallelReductions) {
  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
  // Use a reduce count that is small enough to avoid long compilation times,
  // yet large enough for the test to be meaningful.
  const size_t num_reduces = 32;

  // Scalar f32 addition, used as the to_apply computation of every reduce.
  HloComputation* reduce_computation;
  {
    auto embedded_builder = HloComputation::Builder("add");
    HloInstruction* lhs =
        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
            0, ShapeUtil::MakeShape(F32, {}), "lhs"));
    HloInstruction* rhs =
        embedded_builder.AddInstruction(HloInstruction::CreateParameter(
            1, ShapeUtil::MakeShape(F32, {}), "rhs"));
    embedded_builder.AddInstruction(
        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
    reduce_computation =
        module->AddEmbeddedComputation(embedded_builder.Build());
  }

  Shape input_shape = ShapeUtil::MakeShape(F32, {1024});
  Shape output_shape = ShapeUtil::MakeShape(F32, {});
  // The fused computation reduces each of the num_reduces inputs to a scalar
  // and returns all results in a single tuple.
  HloComputation* fusion_computation;
  {
    auto fusion_builder = HloComputation::Builder("fusion_computation");
    std::vector<HloInstruction*> outputs;
    HloInstruction* constant = fusion_builder.AddInstruction(
        HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
    for (size_t i = 0; i < num_reduces; ++i) {
      HloInstruction* param = fusion_builder.AddInstruction(
          HloInstruction::CreateParameter(i, input_shape, "param"));
      HloInstruction* output =
          fusion_builder.AddInstruction(HloInstruction::CreateReduce(
              output_shape, param, constant, {0}, reduce_computation));
      outputs.push_back(output);
    }
    fusion_builder.AddInstruction(HloInstruction::CreateTuple(outputs));
    fusion_computation = module->AddEmbeddedComputation(fusion_builder.Build());
  }

  // The entry computation forwards its parameters into one kInput fusion
  // that produces the tuple of all reduction results.
  HloComputation::Builder b(TestName());
  std::vector<HloInstruction*> entry_params;
  std::vector<Shape> output_shapes;
  entry_params.reserve(num_reduces);
  output_shapes.reserve(num_reduces);
  for (size_t i = 0; i < num_reduces; ++i) {
    HloInstruction* param = b.AddInstruction(
        HloInstruction::CreateParameter(i, input_shape, "param"));
    entry_params.push_back(param);
    output_shapes.push_back(output_shape);
  }
  b.AddInstruction(HloInstruction::CreateFusion(
      ShapeUtil::MakeTupleShape(output_shapes),
      HloInstruction::FusionKind::kInput, entry_params, fusion_computation));
  module->AddEntryComputation(b.Build());

  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5}));
}

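// Reductions that share inputs should be grouped together, while unrelated
// reductions get their own groups, yielding exactly three reduce groups here.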
TEST_F(ParallelReductionTest, ThreeReductionGroups) {
  const char* hlo_text = R"(
HloModule ThreeReductionGroups

%add_f32 {
  %x = f32[] parameter(0)
  %y = f32[] parameter(1)
  ROOT %add = f32[] add(%x, %y)
}

%fused_computation {
  %param0 = f32[1024,128] parameter(0)
  %param1 = f32[1024,128] parameter(1)
  %param2 = f32[1024,128] parameter(2)
  %constant0 = f32[] constant(0)
  // %mul0, %reduce0, and %reduce1 should go into one group.
  %broadcast0 = f32[1024,128] broadcast(%constant0), dimensions={}
  %mul0 = f32[1024,128] multiply(param0, broadcast0)
  %reduce0 = f32[128] reduce(%mul0, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce1 = f32[128] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32
  // %reduce2 and %reduce3 should go into another group.
  %reduce2 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  %reduce3 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32
  // %reduce4 and %mul2 should go into a third group, even though %broadcast0
  // is reused.
  %mul1 = f32[1024,128] multiply(param2, broadcast0)
  %reduce4 = f32[128] reduce(%mul1, %constant0), dimensions={0}, to_apply=%add_f32
  %mul2 = f32[1024,128] multiply(param2, param2)
  ROOT %tuple =
      (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128])
      tuple(%mul2, %reduce0, %reduce4, %reduce3, %reduce2, %reduce1, %mul0)
}

ENTRY %cluster {
  %param0 = f32[1024,128] parameter(0)
  %param1 = f32[1024,128] parameter(1)
  %param2 = f32[1024,128] parameter(2)
  ROOT %fusion =
      (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128])
      fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation
}
)";

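  // Expect exactly three parallel reduce groups in the unoptimized IR.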
  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
                          ParseAndReturnVerifiedModule(hlo_text));
  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
CHECK: reduce-group-0
CHECK: reduce-group-1
CHECK: reduce-group-2
CHECK-NOT: reduce-group-3
)",
                     /*match_optimized_ir=*/false);
  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
}

}  // namespace
}  // namespace gpu
}  // namespace xla