1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
17 // __ldg builtin) for reads of buffers that don't change during a kernel's
18 // execution.
19 
20 #include <memory>
21 #include <utility>
22 
23 #include "tensorflow/compiler/xla/literal.h"
24 #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
25 #include "tensorflow/compiler/xla/service/hlo_computation.h"
26 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
27 #include "tensorflow/compiler/xla/service/hlo_module.h"
28 #include "tensorflow/compiler/xla/shape_util.h"
29 #include "tensorflow/compiler/xla/xla_data.pb.h"
30 #include "tensorflow/core/lib/core/status_test_util.h"
31 #include "tensorflow/core/platform/test.h"
32 
33 namespace xla {
34 namespace gpu {
35 
// Fixture for the ld.global.nc emission tests in this file. It declares no
// members of its own; it only gives the tests a suite name on top of
// GpuCodegenTest.
class GpuLdgTest : public GpuCodegenTest {};
37 
38 // Parameters are never overwritten, so parameter reads should get ld.global.nc
39 // reads.
40 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
44 //
TEST_F(GpuLdgTest, LdgForParamRead) {
  auto module = CreateNewVerifiedModule();
  HloComputation::Builder entry_builder(TestName());

  // Entry computation: x + x, where x is the only parameter.
  const auto f32_2x2 = ShapeUtil::MakeShape(F32, {2, 2});
  HloInstruction* const x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, f32_2x2, "x"));
  entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kAdd, x, x));
  module->AddEntryComputation(entry_builder.Build());

  // FileCheck pattern: no plain ld.global.f32 may show up before the first
  // ld.global.nc.f32.
  constexpr char kPtxPattern[] = R"(
    CHECK-NOT: ld.global.f32
    CHECK: ld.global.nc.f32
  )";
  CompileAndOptionallyVerifyPtx(std::move(module), kPtxPattern);
}
63 
64 // Check that reading a buffer produced by a non-parameter HLO also results in
65 // ld.global.nc, if that buffer isn't modified within the instruction that reads
66 // it.
67 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
71 //
TEST_F(GpuLdgTest, LdgForNonParamRead) {
  auto module = CreateNewVerifiedModule();
  HloComputation::Builder entry_builder(TestName());

  const auto f32_2x2 = ShapeUtil::MakeShape(F32, {2, 2});
  HloInstruction* const x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, f32_2x2, "x"));

  // sum = x + x is the non-parameter value; sum_squared = sum * sum reads it.
  HloInstruction* const sum = entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kAdd, x, x));
  HloInstruction* const sum_squared = entry_builder.AddInstruction(
      HloInstruction::CreateBinary(f32_2x2, HloOpcode::kMultiply, sum, sum));

  // Both values are returned through a tuple, which is added last.
  entry_builder.AddInstruction(
      HloInstruction::CreateTuple({sum, sum_squared}));
  module->AddEntryComputation(entry_builder.Build());

  // FileCheck pattern: after an opening brace, an ld.global.nc.f32 must be
  // found with no plain ld.global.f32 preceding it, followed by a closing
  // brace.
  constexpr char kPtxPattern[] = R"(
    CHECK: {
    CHECK-NOT: ld.global.f32
    CHECK: ld.global.nc.f32
    CHECK: }
  )";
  CompileAndOptionallyVerifyPtx(std::move(module), kPtxPattern);
}
95 
96 // Check that reading a buffer that's modified in-place does not produce
97 // ld.global.nc.
98 //
99 // We do this by creating a reduce that feeds into a sin.  We don't currently
100 // fuse sin into reduce, and the sin is elementwise, so it reuses its input
101 // buffer as its output.
102 //
103 // It seems like a fair bet that we won't start fusing sin into the output of
104 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
105 // you, future reader, permission to delete this test.
106 //
// On the ROCM platform the "ptx" string is not populated for the compiled
// executable, and hence the call to CompileAndOptionallyVerifyPtx does not do
// the "VerifyPtx" part; it merely compiles the executable.
110 //
TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
  auto module = CreateNewVerifiedModule();
  HloComputation::Builder entry_builder(TestName());

  // Scalar f32 "add" computation, registered on the module as an embedded
  // computation and used below as the reduction function.
  HloComputation* const scalar_add = [&module] {
    HloComputation::Builder add_builder("add");
    HloInstruction* const lhs =
        add_builder.AddInstruction(HloInstruction::CreateParameter(
            0, ShapeUtil::MakeShape(F32, {}), "lhs"));
    HloInstruction* const rhs =
        add_builder.AddInstruction(HloInstruction::CreateParameter(
            1, ShapeUtil::MakeShape(F32, {}), "rhs"));
    add_builder.AddInstruction(
        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
    return module->AddEmbeddedComputation(add_builder.Build());
  }();

  const auto input_shape = ShapeUtil::MakeShape(F32, {32, 32});
  const auto reduced_shape = ShapeUtil::MakeShape(F32, {32});

  HloInstruction* const x = entry_builder.AddInstruction(
      HloInstruction::CreateParameter(0, input_shape, "x"));

  // reduce(x + x, init = 0) over dimension 0, using scalar_add. The operand
  // and init instructions are created inline as arguments.
  HloInstruction* const reduced =
      entry_builder.AddInstruction(HloInstruction::CreateReduce(
          reduced_shape,
          entry_builder.AddInstruction(HloInstruction::CreateBinary(
              input_shape, HloOpcode::kAdd, x, x)),
          entry_builder.AddInstruction(
              HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0))),
          {0}, scalar_add));

  // sin(reduced) is added last.
  entry_builder.AddInstruction(
      HloInstruction::CreateUnary(reduced_shape, HloOpcode::kSin, reduced));
  module->AddEntryComputation(entry_builder.Build());

  // FileCheck pattern: in the kernel labelled ".entry sin", a plain
  // ld.global.f32 must be found after the opening brace with no
  // ld.global.nc.f32 preceding it.
  constexpr char kPtxPattern[] = R"(
    CHECK-LABEL: .entry sin
    CHECK: {
    CHECK-NOT: ld.global.nc.f32
    CHECK: ld.global.f32
    CHECK: }
  )";
  CompileAndOptionallyVerifyPtx(std::move(module), kPtxPattern);
}
153 
154 }  // namespace gpu
155 }  // namespace xla
156