/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

16 #include "llvm/Transforms/Utils/Cloning.h"
17 #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project
18 #include "mlir/Target/LLVMIR.h" // from @llvm-project
19 #include "mlir/Target/LLVMIR/Export.h" // from @llvm-project
20 #include "mlir/Transforms/DialectConversion.h" // from @llvm-project
21 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
22 #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
23 #include "tensorflow/compiler/xla/debug_options_flags.h"
24 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
25 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
26 #include "tensorflow/compiler/xla/service/gpu/target_constants.h"
27 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
28 #include "tensorflow/compiler/xla/status.h"
29 #include "tensorflow/compiler/xla/statusor.h"
30 #include "tensorflow/core/platform/cuda_libdevice_path.h"
31 #include "tensorflow/core/platform/logging.h"
32 #include "tensorflow/core/platform/path.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

using xla::InternalError;

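// A pass that compiles the contents of a `gpu.module` to GPU machine code and
// attaches the resulting blob to the module as a string attribute (named by
// `blob_annotation`). Depending on the build configuration, the blob is a
// CUDA cubin/PTX fatbin or a ROCm HSACO fatbin.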
class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures,
                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
    if (!blob_annotation.empty()) {
      blob_annotation_ = blob_annotation.str();
    }
    architectures_ = architectures;
    generate_fatbin_ = generate_fatbin;
    print_ptx_ = print_ptx;
    enable_ftz_ = enable_ftz;
  }

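  // Compiles the gpu.module to a binary blob and stores it under the
  // `blob_annotation_` attribute. On failure, the error message is attached
  // to the module and the pass signals failure.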
  void runOnOperation() override {
    mlir::gpu::GPUModuleOp gpu_module = getOperation();
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          mlir::StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

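  // Lowers the module to LLVM IR and compiles it for every requested
  // architecture. With `generate_fatbin_`, the per-architecture images are
  // bundled into a single fatbin; otherwise the single image is returned
  // directly.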
  xla::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      mlir::gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return InternalError("Expected at least one GPU architecture.");
    }
    if (!generate_fatbin_ && architectures_.size() > 1) {
      return InternalError(
          "Can only generate machine code for more than one architecture as a "
          "fatbin.");
    }

    llvm::LLVMContext llvmContext;
    auto llvmModule = mlir::translateModuleToLLVMIR(gpu_module, llvmContext);

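    // The remainder of this function is backend specific: HSACO images for
    // ROCm, cubin/PTX images for CUDA.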
#if TENSORFLOW_USE_ROCM
    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to ROCDL IR");
    }

    llvmModule->setModuleIdentifier("acme");

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
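    // Compile one HSACO image per requested ROCm architecture (e.g. gfx906).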
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return InternalError(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse ROCm architecture number");
      }

      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      xla::gpu::GpuVersion gpu_version{std::make_pair(arch, arch_str)};
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(), gpu_version, config, libdevice_dir);
      if (!hsaco_or.ok()) {
        return InternalError("Failure when generating HSACO");
      }

      auto hsaco = hsaco_or.ValueOrDie();
      if (!generate_fatbin_) {
        // Skip fatbin generation and return the first and only GPU machine
        // code. This is currently only used for `tf_to_gpu_binary` and will
        // eventually disappear.
        return hsaco;
      }

      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to NVVM");
    }

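    // Give the module an identifier and the data layout used by the XLA
    // NVPTX backend.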
    llvmModule->setModuleIdentifier("acme");
    llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    config.set_debug_options(options);

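    // Callback that lets the NVPTX target machine fuse FP operations
    // (e.g. multiply-add contraction).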
    auto enable_fusion = [](llvm::TargetMachine* target) {
      target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
    };

    // Compile and collect requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
    for (const std::string& arch_str : architectures_) {
      // Parse CUDA architecture.
      absl::string_view consumable_arch(arch_str);
      bool is_compute_profile;
      if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
        is_compute_profile = true;
      } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
        is_compute_profile = false;
      } else {
        return InternalError(
            "Could not parse CUDA architecture prefix (expected sm_ or "
            "compute_)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse CUDA architecture number");
      }

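      // The architecture number encodes the compute capability, e.g. sm_75
      // has major version 7 and minor version 5.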
      uint32_t cc_major = arch / 10;
      uint32_t cc_minor = arch % 10;
      // Module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(llvm_module_copy.get(),
                                        std::make_pair(cc_major, cc_minor),
                                        config, libdevice_dir, enable_fusion));

      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
                          tensorflow::se::CompileGpuAsm(
                              cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));

      if (!generate_fatbin_) {
        // Skip fatbin generation and return the first and only GPU machine
        // code. This is currently only used for `tf_to_gpu_binary` and will
        // eventually disappear.
        return gpu_asm;
      }

      // Collect cubin (and PTX image if requested).
      images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images,
                                        gpu_asm_opts.preferred_cuda_dir);
#endif

    return InternalError(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA is defined."
        " Did you specify either --config=rocm or --config=cuda?");
  }

 private:
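  // Returns the path to the libdevice bitcode directory, searching the
  // candidate CUDA roots derived from the module config.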
  xla::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return InternalError(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }

  bool enable_ftz_;
};

}  // namespace

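// Creates a GpuKernelToBlobPass instance with the given options.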
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
}

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir