1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 // Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
17 // __ldg builtin) for reads of buffers that don't change during a kernel's
18 // execution.
19
20 #include <memory>
21 #include <utility>
22
23 #include "tensorflow/compiler/xla/literal.h"
24 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
25 #include "tensorflow/compiler/xla/service/hlo_computation.h"
26 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
27 #include "tensorflow/compiler/xla/service/hlo_module.h"
28 #include "tensorflow/compiler/xla/shape_util.h"
29 #include "tensorflow/compiler/xla/xla_data.pb.h"
30 #include "tensorflow/core/lib/core/status_test_util.h"
31 #include "tensorflow/core/platform/test.h"
32
33 namespace xla {
34 namespace gpu {
35
36 class GpuLdgTest : public GpuCodegenTest {};
37
38 // Parameters are never overwritten, so parameter reads should get ld.global.nc
39 // reads.
40 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
44 //
TEST_F(GpuLdgTest, LdgForParamRead) {
  // Entry computation: x + x, where x is a 2x2 f32 parameter.
  const auto f32_2x2 = ShapeUtil::MakeShape(F32, {2, 2});

  HloComputation::Builder entry_builder(TestName());
  HloInstruction* x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, f32_2x2, "x"));
  entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kAdd, x, x));

  auto module = CreateNewVerifiedModule();
  module->AddEntryComputation(entry_builder.Build());

  // The pattern requires an ld.global.nc.f32 with no plain ld.global.f32
  // appearing before it.
  CompileAndOptionallyVerifyPtx(std::move(module), R"(
CHECK-NOT: ld.global.f32
CHECK: ld.global.nc.f32
)");
}
63
64 // Check that reading a buffer produced by a non-parameter HLO also results in
65 // ld.global.nc, if that buffer isn't modified within the instruction that reads
66 // it.
67 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
71 //
TEST_F(GpuLdgTest, LdgForNonParamRead) {
  const auto f32_2x2 = ShapeUtil::MakeShape(F32, {2, 2});
  HloComputation::Builder entry_builder(TestName());

  // doubled = x + x; squared = doubled * doubled.
  HloInstruction* x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, f32_2x2, "x"));
  HloInstruction* doubled = entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kAdd, x, x));
  HloInstruction* squared = entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kMultiply, doubled,
                                   doubled));
  // The root is a tuple holding both the sum and its square.
  entry_builder.AddInstruction(
      HloInstruction::CreateTuple({doubled, squared}));

  auto module = CreateNewVerifiedModule();
  module->AddEntryComputation(entry_builder.Build());

  // After the first "{", an ld.global.nc.f32 must appear with no plain
  // ld.global.f32 before it.
  CompileAndOptionallyVerifyPtx(std::move(module), R"(
CHECK: {
CHECK-NOT: ld.global.f32
CHECK: ld.global.nc.f32
CHECK: }
)");
}
95
96 // Check that reading a buffer that's modified in-place does not produce
97 // ld.global.nc.
98 //
99 // We do this by creating a reduce that feeds into a sin. We don't currently
100 // fuse sin into reduce, and the sin is elementwise, so it reuses its input
101 // buffer as its output.
102 //
103 // It seems like a fair bet that we won't start fusing sin into the output of
104 // reduce in the foreseeable future. But if that turns out to be wrong, I give
105 // you, future reader, permission to delete this test.
106 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
110 //
TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
  auto module = CreateNewVerifiedModule();

  // Reducer: a scalar f32 computation named "add" that returns lhs + rhs.
  HloComputation* scalar_add = nullptr;
  {
    const auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
    HloComputation::Builder add_builder("add");
    HloInstruction* lhs = add_builder.AddInstruction(
        HloInstruction::CreateParameter(0, scalar_f32, "lhs"));
    HloInstruction* rhs = add_builder.AddInstruction(
        HloInstruction::CreateParameter(1, scalar_f32, "rhs"));
    add_builder.AddInstruction(
        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
    scalar_add = module->AddEmbeddedComputation(add_builder.Build());
  }

  const auto matrix_shape = ShapeUtil::MakeShape(F32, {32, 32});
  const auto vector_shape = ShapeUtil::MakeShape(F32, {32});

  // Entry computation: sin(reduce(x + x, init=0, dims={0}, reducer=add)).
  HloComputation::Builder entry_builder(TestName());
  HloInstruction* x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, matrix_shape, "x"));
  HloInstruction* doubled = entry_builder.AddInstruction(
      HloInstruction::CreateBinary(matrix_shape, HloOpcode::kAdd, x, x));
  HloInstruction* zero = entry_builder.AddInstruction(
      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
  HloInstruction* reduced =
      entry_builder.AddInstruction(HloInstruction::CreateReduce(
          vector_shape, doubled, zero, {0}, scalar_add));
  entry_builder.AddInstruction(
      HloInstruction::CreateUnary(vector_shape, HloOpcode::kSin, reduced));

  module->AddEntryComputation(entry_builder.Build());

  // Starting from the ".entry sin" label, a plain ld.global.f32 must appear
  // with no ld.global.nc.f32 before it.
  CompileAndOptionallyVerifyPtx(std::move(module), R"(
CHECK-LABEL: .entry sin
CHECK: {
CHECK-NOT: ld.global.nc.f32
CHECK: ld.global.f32
CHECK: }
)");
}
153
154 } // namespace gpu
155 } // namespace xla
156