/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <utility>

#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
#include "tensorflow/core/platform/test.h"

namespace xla {
namespace gpu {
namespace {

class GpuUnrollingTest : public GpuCodegenTest {};

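// GpuCodegenTest::CompileAndVerifyIr compiles the given module and runs
// FileCheck-style patterns over the generated LLVM IR;
// /*match_optimized_ir=*/true matches against the post-optimization IR.
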
const char *const kAddModule = R"(
    HloModule test_module

    fused_computation {
      p0.param_0 = f32[2,2]{1,0} parameter(0)
      p1.param_1 = f32[2,2]{1,0} parameter(1)
      ROOT add = f32[2,2] add(p0.param_0, p1.param_1)
    }

    ENTRY AddFusion {
      p0 = f32[2,2]{1,0} parameter(0)
      p1 = f32[2,2]{1,0} parameter(1)
      ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop,
                                                  calls=fused_computation
    })";

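// Each test below builds an HloModuleConfig by hand. A minimal helper such as
// the following sketch (hypothetical; the tests keep their explicit setup so
// each one stays self-contained) could consolidate that boilerplate:
//
//   HloModuleConfig MakeConfigWithUnrollFactor(int unroll_factor) {
//     HloModuleConfig config;
//     auto debug_options = HloTestBase::GetDebugOptionsForTest();
//     debug_options.set_xla_gpu_max_kernel_unroll_factor(unroll_factor);
//     config.set_debug_options(debug_options);
//     return config;
//   }
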
TEST_F(GpuUnrollingTest, DoNotUnroll) {
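  // With the maximum unroll factor capped at 1, the kernel body should
  // contain exactly one fadd; the CHECK-NOT below guards against extras.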
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: }
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollFourTimes) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  // We request a factor of 8, but the computation works on 4 elements,
  // limiting the maximum unroll factor.
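  // In other words, the effective factor is min(8, 4) = 4, which is why the
  // IR checks below expect exactly four fadds.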
  debug_options.set_xla_gpu_max_kernel_unroll_factor(8);
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: }
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
  // The default unrolling factor is 4.
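  // (That is, xla_gpu_max_kernel_unroll_factor defaults to 4, so no override
  // is needed; the checks expect one <4 x float> load, four fadds, and one
  // <4 x float> store.)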
  HloModuleConfig config;
  auto debug_options = GetDebugOptionsFromFlags();
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: load <4 x float>
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: store <4 x float>
; CHECK: }
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) {
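  // Unfused elementwise ops take the same loop-emitter path as kLoop fusions,
  // so a bare add kernel should likewise be unrolled (and vectorized) four
  // times.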
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
    HloModule test_module

    ENTRY AddFunc {
      p0 = f32[2,2]{1,0} parameter(0)
      p1 = f32[2,2]{1,0} parameter(1)
      ROOT add = f32[2,2]{1,0} add(p0, p1)
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @add
; CHECK: load <4 x float>
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: store <4 x float>
; CHECK: }
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedSine) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedSineModule = R"(
    HloModule test_module

    ENTRY SineFunc {
      p0 = f32[1600000]{0} parameter(0)
      ROOT s = f32[1600000]{0} sine(p0)
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedSineModule, config).ValueOrDie();

  // Note: on the ROCm side we do the bare minimum to make the test pass.
  // The "sine" function takes a different code-generation path from nvptx:
  // on ROCm it is pulled in from ROCm-Device-Libs, whereas on CUDA the
  // generated LLVM IR is compiled to PTX.
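  // Hence the CUDA pattern below: a single scalar "load float" with no
  // further float loads indicates the kernel was not unrolled.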
  auto expected_ir = is_built_with_rocm_ ? R"(
; CHECK: __ocml_sin_f32
; CHECK-NOT: load float
)"
                                         : R"(
; CHECK: load float
; CHECK-NOT: load float
}
)";

  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedCosine) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedCosineModule = R"(
    HloModule test_module

    ENTRY CosineFunc {
      p0 = f32[1600000]{0} parameter(0)
      ROOT s = f32[1600000]{0} cosine(p0)
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedCosineModule, config).ValueOrDie();

  // Note: on the ROCm side we do the bare minimum to make the test pass.
  // The "cosine" function takes a different code-generation path from nvptx:
  // on ROCm it is pulled in from ROCm-Device-Libs, whereas on CUDA the
  // generated LLVM IR is compiled to PTX.
  auto expected_ir = is_built_with_rocm_ ? R"(
; CHECK: __ocml_cos_f32
; CHECK-NOT: load float
)"
                                         : R"(
; CHECK: load float
; CHECK-NOT: load float
}
)";

  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedPower) {
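  // Like sine and cosine above, power takes the non-unrolled path, so the
  // checks expect exactly one scalar float load.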
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedPowerModule = R"(
    HloModule test_module

    ENTRY PowerFunc {
      p0 = f32[1600000]{0} parameter(0)
      ROOT s = f32[1600000]{0} power(p0, p0)
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedPowerModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK: load float
; CHECK-NOT: load float
}
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedAtan2) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAtan2Module = R"(
    HloModule test_module

    ENTRY Atan2Func {
      p0 = f32[16000000]{0} parameter(0)
      ROOT s = f32[16000000]{0} atan2(p0, p0)
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAtan2Module, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK: load float
; CHECK-NOT: load float
}
      )",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
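  // With an unroll factor of 2 and two fusion outputs (add and multiply),
  // the checks expect two <2 x float> loads, two add/multiply pairs, and two
  // <2 x float> stores.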
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
  // Disable layout assignment for this test. Layout assignment does not
  // expect fusions to be present, and so it does the wrong thing.
  debug_options.add_xla_disable_hlo_passes("layout-assignment");
  config.set_debug_options(debug_options);

  const char *const kMultiOutputFusionModule = R"(
    HloModule test_module

    fused_computation {
      p0.param_0 = f32[2,2]{1,0} parameter(0)
      p1.param_1 = f32[2,2]{1,0} parameter(1)
      add = f32[2,2]{1,0} add(p0.param_0, p1.param_1)
      mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1)
      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul)
    }

    ENTRY AddMultiplyFusion {
      p0 = f32[2,2]{1,0} parameter(0)
      p1 = f32[2,2]{1,0} parameter(1)
      ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop,
                                                   calls=fused_computation
    })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kMultiOutputFusionModule, config)
          .ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: load <2 x float>
; CHECK: load <2 x float>
; CHECK-NOT: load <2 x float>
; CHECK: fadd
; CHECK: fmul
; CHECK: fadd
; CHECK: fmul
; CHECK: store <2 x float>
; CHECK: store <2 x float>
; CHECK-NOT: store <2 x float>
; CHECK-NOT: fadd
; CHECK-NOT: fmul
; CHECK: }
      )",
                     /*match_optimized_ir=*/true);
}

}  // namespace
}  // namespace gpu
}  // namespace xla