1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <memory>
17 #include <utility>
18
19 #include "tensorflow/compiler/xla/literal.h"
20 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
21 #include "tensorflow/compiler/xla/service/hlo_computation.h"
22 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
23 #include "tensorflow/compiler/xla/service/hlo_module.h"
24 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
25 #include "tensorflow/compiler/xla/service/hlo_parser.h"
26 #include "tensorflow/compiler/xla/shape_util.h"
27 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
28 #include "tensorflow/compiler/xla/xla.pb.h"
29 #include "tensorflow/compiler/xla/xla_data.pb.h"
30 #include "tensorflow/core/platform/test.h"
31
32 namespace xla {
33 namespace gpu {
34
35 // This file tests the index expressions used to reference source tensors. When
36 // the destination tensor and source tensor have compatible shapes, the linear
37 // index is used to access the source tensor. Otherwise, dimensional indices
38 // computed from the linear index are used to access the source tensor.
39
// Fixture shared by the index-expression tests in this file. It adds no
// members or overrides of its own on top of GpuCodegenTest.
class GpuIndexTest : public GpuCodegenTest {};
41
// Builds an elementwise `x >= y` over two f32[5,7,2] parameters with a
// pred[5,7,2] result and verifies that the optimized IR contains neither udiv
// nor urem.
TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
  HloComputation::Builder computation(TestName());

  const auto operand_shape = ShapeUtil::MakeShape(F32, {5, 7, 2});
  const auto result_shape = ShapeUtil::MakeShape(PRED, {5, 7, 2});

  HloInstruction* lhs = computation.AddInstruction(
      HloInstruction::CreateParameter(0, operand_shape, "x"));
  HloInstruction* rhs = computation.AddInstruction(
      HloInstruction::CreateParameter(1, operand_shape, "y"));
  computation.AddInstruction(HloInstruction::CreateCompare(
      result_shape, lhs, rhs, ComparisonDirection::kGe));

  auto module = CreateNewVerifiedModule();
  module->AddEntryComputation(computation.Build());

  // Only the optimized IR is inspected, because the unoptimized IR still
  // contains dead udiv and urem instructions.
  constexpr char kExpectedIr[] = R"(
; CHECK-NOT: udiv
; CHECK-NOT: urem
)";
  CompileAndVerifyIr(std::move(module), kExpectedIr,
                     /*match_optimized_ir=*/true);
}
65
// Compares f32[5,7,2] `x` against f32[5,14] `y` reshaped to [5,7,2] and
// verifies that the optimized IR contains neither udiv nor urem.
TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) {
  constexpr char kHloText[] = R"(
    HloModule test_module

    ENTRY CompatibleUseLinearIndexWithReshape {
      x = f32[5,7,2]{2,1,0} parameter(0)
      y = f32[5,14]{1,0} parameter(1)
      reshape = f32[5,7,2]{2,1,0} reshape(y)
      ROOT gte = pred[5,7,2]{2,1,0} compare(x, reshape), direction=GE
    })";

  // Only the optimized IR is inspected, because the unoptimized IR still
  // contains dead udiv and urem instructions.
  constexpr char kExpectedIr[] = R"(
; CHECK-NOT: udiv
; CHECK-NOT: urem
)";

  HloModuleConfig module_config;
  module_config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloText, module_config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module), kExpectedIr,
                     /*match_optimized_ir=*/true);
}
89
// Compares f32[1,7,2,5,3] `x` against f32[2,1,3] `y` that is reshaped to
// [1,2,3] and broadcast along dimensions {0,2,4}. The unoptimized IR is
// expected to derive dim0..dim4 from the linear index with udiv/urem and to
// address the [2 x [1 x [3 x float]]] operand using dim2 and dim4.
TEST_F(GpuIndexTest,
       ReuseMultidimIndexWithTrivialReshapeAndNonContiguousBroadcast) {
  // NOTE(review): the ENTRY name below matches a different test's name; it
  // looks like a copy-paste leftover and is left untouched here.
  constexpr char kHloText[] = R"(
    HloModule test_module

    ENTRY CompatibleUseLinearIndexWithReshape {
      x = f32[1,7,2,5,3]{4,3,2,1,0} parameter(0)
      y = f32[2,1,3]{2,1,0} parameter(1)
      reshape = f32[1,2,3]{2,1,0} reshape(y)
      broadcast = f32[1,7,2,5,3]{4,3,2,1,0} broadcast(reshape), dimensions={0,2,4}
      ROOT gte = pred[1,7,2,5,3]{4,3,2,1,0} compare(x, broadcast), direction=GE
    })";

  constexpr char kExpectedIr[] = R"(
; CHECK: %[[tmp4:.*]] = udiv i32 %[[linear_index:.*]], 1
; CHECK: %[[dim4:.*]] = urem i32 %[[tmp4]], 3
; CHECK: %[[tmp3:.*]] = udiv i32 %[[linear_index]], 3
; CHECK: %[[dim3:.*]] = urem i32 %[[tmp3]], 5
; CHECK: %[[tmp2:.*]] = udiv i32 %[[linear_index]], 15
; CHECK: %[[dim2:.*]] = urem i32 %[[tmp2]], 2
; CHECK: %[[tmp1:.*]] = udiv i32 %[[linear_index]], 30
; CHECK: %[[dim1:.*]] = urem i32 %[[tmp1]], 7
; CHECK: %[[dim0:.*]] = udiv i32 %[[linear_index]], 210
; CHECK: %{{.*}} = getelementptr inbounds [2 x [1 x [3 x float]]], ptr %{{.*}}, i32 0, i32 %[[dim2]], i32 0, i32 %[[dim4]]
)";

  HloModuleConfig module_config;
  module_config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloText, module_config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module), kExpectedIr,
                     /*match_optimized_ir=*/false);
}
121
122 #if TENSORFLOW_USE_ROCM
123 #else
TEST_F(GpuIndexTest,CompatibleUseLinearIndexWithReshapeAndBroadcast)124 TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
125 HloModuleConfig config;
126 config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
127 auto module = ParseAndReturnVerifiedModule(R"(
128 HloModule test_module
129
130 ENTRY CompatibleUseLinearIndexWithReshape {
131 x = f32[5,7,2]{2,1,0} parameter(0)
132 y = f32[14]{0} parameter(1)
133 reshape = f32[7,2]{1,0} reshape(y)
134 broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2}
135 ROOT gte = pred[5,7,2]{2,1,0} compare(x, broadcast), direction=GE
136 })",
137 config)
138 .ValueOrDie();
139
140 // Check the optimized IR reuses the linear index by calculating modulo 14.
141
142 // In the IR generated for AMDGPUs, we do not seem to have the
143 // the addrspace(1) attribute for the lines being checked by the following
144 // patterns.
145 // need to investigate why that is the case, and whether or not it is ok
146 CompileAndVerifyIr(std::move(module),
147 R"(
148 ; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
149 ; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
150 ; CHECK: getelementptr inbounds float, ptr{{( addrspace\(1\))?}} %[[alloc:.*]], i64 %[[idx1]]
151 )",
152 /*match_optimized_ir=*/true);
153 }
154 #endif
155
// Converts f32[1,1024,1,256]{3,2,1,0} to f16[1,1024,1,256]{2,3,1,0} with the
// kernel unroll factor capped at 1, and verifies that the unoptimized IR uses
// the same linear index for both the load and the store address.
TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithSizeOneDimensions) {
  constexpr char kHloText[] = R"(
    HloModule test_module

    ENTRY CompatibleUseLinearIndexWithSizeOneDimensions {
      x = f32[1,1024,1,256]{3,2,1,0} parameter(0)
      ROOT y = f16[1,1024,1,256]{2,3,1,0} convert(x)
    })";

  // Check that the unoptimized IR reuses the linear index.
  constexpr char kExpectedIr[] = R"(
; CHECK-LABEL: @fusion
; CHECK: udiv i32 %[[linear_index:.*]], 262144
; CHECK: %[[ld_addr:.*]] = getelementptr inbounds float, ptr {{.*}}, i32 %[[linear_index]]
; CHECK: load float, ptr %[[ld_addr]]
; CHECK: %[[st_addr:.*]] = getelementptr inbounds half, ptr {{.*}}, i32 %[[linear_index]]
; CHECK: store half {{.*}}, ptr %[[st_addr]]
)";

  auto test_options = HloTestBase::GetDebugOptionsForTest();
  test_options.set_xla_gpu_max_kernel_unroll_factor(1);
  HloModuleConfig module_config;
  module_config.set_debug_options(test_options);

  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloText, module_config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module), kExpectedIr,
                     /*match_optimized_ir=*/false);
}
184
// Compares a transpose of f32[2,1024,3,256] `x` (dimensions={1,0,3,2}) against
// f32[1024,2,256,3]{2,3,0,1} `y`, with the kernel unroll factor capped at 1,
// and verifies that the optimized IR contains neither udiv nor urem.
TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithTranspose) {
  constexpr char kHloText[] = R"(
    HloModule test_module

    ENTRY CompatibleUseLinearIndexWithTranspose {
      x = f32[2,1024,3,256]{3,2,1,0} parameter(0)
      y = f32[1024,2,256,3]{2,3,0,1} parameter(1)
      transpose = f32[1024,2,256,3]{3,2,1,0} transpose(x), dimensions={1,0,3,2}
      ROOT gte = pred[1024,2,256,3]{2,3,0,1} compare(transpose, y), direction=GE
    })";

  // Check the optimized IR contains no udiv and urem.
  constexpr char kExpectedIr[] = R"(
; CHECK-NOT: udiv
; CHECK-NOT: urem
)";

  auto test_options = HloTestBase::GetDebugOptionsForTest();
  test_options.set_xla_gpu_max_kernel_unroll_factor(1);
  HloModuleConfig module_config;
  module_config.set_debug_options(test_options);

  auto hlo_module =
      ParseAndReturnVerifiedModule(kHloText, module_config).ValueOrDie();

  CompileAndVerifyIr(std::move(hlo_module), kExpectedIr,
                     /*match_optimized_ir=*/true);
}
210
211 } // namespace gpu
212 } // namespace xla
213