/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <utility>

#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
#include "tensorflow/core/platform/test.h"

namespace xla {
namespace gpu {
namespace {

class GpuUnrollingTest : public GpuCodegenTest {};

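// kAddModule is a simple kLoop fusion of a 2x2 element-wise add; the
// unrolling tests below compile it with different unroll factors.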
const char *const kAddModule = R"(
  HloModule test_module

  fused_computation {
    p0.param_0 = f32[2,2]{1,0} parameter(0)
    p1.param_1 = f32[2,2]{1,0} parameter(1)
    ROOT add = f32[2,2] add(p0.param_0, p1.param_1)
  }

  ENTRY BroadcastIntoAdd {
    p0 = f32[2,2]{1,0} parameter(0)
    p1 = f32[2,2]{1,0} parameter(1)
    ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop,
                                                calls=fused_computation
  })";

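// With the maximum unroll factor forced to 1, the emitted kernel should
// contain exactly one fadd.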
TEST_F(GpuUnrollingTest, DoNotUnroll) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: }
)",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollFourTimes) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  // We request a factor of 8, but the computation works on 4 elements,
  // limiting the maximum unroll factor.
  debug_options.set_xla_gpu_max_kernel_unroll_factor(8);
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: }
)",
                     /*match_optimized_ir=*/true);
}

TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
  // The default unrolling factor is 4.
  HloModuleConfig config;
  auto debug_options = GetDebugOptionsFromFlags();
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);
  auto hlo_module =
      ParseAndReturnVerifiedModule(kAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: load <4 x float>
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: store <4 x float>
; CHECK: }
)",
                     /*match_optimized_ir=*/true);
}

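// Unfused element-wise ops are unrolled as well: with a factor of 4 the add
// kernel should use <4 x float> loads and stores and emit four fadds.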
TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  debug_options.set_xla_gpu_enable_mlir_lowering(false);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
  HloModule test_module

  ENTRY AddFunc {
    p0 = f32[2,2]{1,0} parameter(0)
    p1 = f32[2,2]{1,0} parameter(1)
    ROOT add = f32[2,2]{1,0} add(p0, p1)
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @add
; CHECK: load <4 x float>
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK: fadd
; CHECK-NOT: fadd
; CHECK: store <4 x float>
; CHECK: }
)",
                     /*match_optimized_ir=*/true);
}

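// Unrolling is disabled for an unfused sine: the kernel should not use
// vectorized loads.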
TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedSine) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
  HloModule test_module

  ENTRY SineFunc {
    p0 = f32[1600000]{0} parameter(0)
    ROOT s = f32[1600000]{0} sine(p0)
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  // Note: on the ROCm side we do the bare minimum to make the test pass.
  // "sine" takes a different code-generation path than on nvptx: on the ROCm
  // platform it is pulled in from ROCm-Device-Libs, whereas on CUDA the
  // generated LLVM IR is compiled to PTX.
  auto expected_ir = is_built_with_rocm_ ? R"(
; CHECK: __ocml_sin_f32
; CHECK-NOT: load float
)"
                                         : R"(
; CHECK: load float
; CHECK-NOT: load float
}
)";

  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
}

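// Unrolling is likewise disabled for an unfused cosine: the kernel should not
// use vectorized loads.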
TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedCosine) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
  HloModule test_module

  ENTRY SineFunc {
    p0 = f32[1600000]{0} parameter(0)
    ROOT s = f32[1600000]{0} cosine(p0)
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  // Note: on the ROCm side we do the bare minimum to make the test pass.
  // "cosine" takes a different code-generation path than on nvptx: on the ROCm
  // platform it is pulled in from ROCm-Device-Libs, whereas on CUDA the
  // generated LLVM IR is compiled to PTX.
  auto expected_ir = is_built_with_rocm_ ? R"(
; CHECK: __ocml_cos_f32
; CHECK-NOT: load float
)"
                                         : R"(
; CHECK: load float
; CHECK-NOT: load float
}
)";

  CompileAndVerifyIr(std::move(hlo_module), expected_ir,
                     /*match_optimized_ir=*/true);
}

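// Unrolling is disabled for an unfused power as well: expect a single scalar
// load.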
TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedPower) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
  HloModule test_module

  ENTRY SineFunc {
    p0 = f32[1600000]{0} parameter(0)
    ROOT s = f32[1600000]{0} power(p0, p0)
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK: load float
; CHECK-NOT: load float
}
)",
                     /*match_optimized_ir=*/true);
}

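// Unrolling is disabled for an unfused atan2: expect a single scalar load.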
TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedAtan2) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
  config.set_debug_options(debug_options);

  const char *const kUnfusedAddModule = R"(
  HloModule test_module

  ENTRY SineFunc {
    p0 = f32[16000000]{0} parameter(0)
    ROOT s = f32[16000000]{0} atan2(p0, p0)
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK: load float
; CHECK-NOT: load float
}
)",
                     /*match_optimized_ir=*/true);
}

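// With an unroll factor of 2, the multi-output kLoop fusion should emit
// <2 x float> loads and stores and two add/multiply pairs.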
TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
  HloModuleConfig config;
  auto debug_options = HloTestBase::GetDebugOptionsForTest();
  debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
  // Disable layout assignment for this test. Layout assignment does not
  // expect fusions to be present, and so it does the wrong thing.
  debug_options.add_xla_disable_hlo_passes("layout-assignment");
  config.set_debug_options(debug_options);

  const char *const kMultiOutputFusionModule = R"(
  HloModule test_module

  fused_computation {
    p0.param_0 = f32[2,2]{1,0} parameter(0)
    p1.param_1 = f32[2,2]{1,0} parameter(1)
    add = f32[2,2]{1,0} add(p0.param_0, p1.param_1)
    mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1)
    ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul)
  }

  ENTRY BroadcastIntoAdd {
    p0 = f32[2,2]{1,0} parameter(0)
    p1 = f32[2,2]{1,0} parameter(1)
    ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop,
                                                 calls=fused_computation
  })";
  auto hlo_module =
      ParseAndReturnVerifiedModule(kMultiOutputFusionModule, config)
          .ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module),
                     R"(
; CHECK-LABEL: @fusion
; CHECK: load <2 x float>
; CHECK: load <2 x float>
; CHECK-NOT: load <2 x float>
; CHECK: fadd
; CHECK: fmul
; CHECK: fadd
; CHECK: fmul
; CHECK: store <2 x float>
; CHECK: store <2 x float>
; CHECK-NOT: store <2 x float>
; CHECK-NOT: fadd
; CHECK-NOT: fmul
; CHECK: }
)",
                     /*match_optimized_ir=*/true);
}

}  // namespace
}  // namespace gpu
}  // namespace xla