/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

using xla::InternalError;

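// A pass that compiles the nested gpu.module down to a device binary blob.
// The module is translated to LLVM IR, compiled once per requested GPU
// architecture (PTX/cubin for CUDA, HSACO for ROCm), the per-architecture
// images are bundled into a single fatbin, and the resulting blob is attached
// to the gpu.module as a string attribute under `blob_annotation_`.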
class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures, bool print_ptx,
                      bool print_llvmir, bool enable_ftz) {
    if (!blob_annotation.empty()) {
      blob_annotation_ = blob_annotation.str();
    }
    architectures_ = architectures;
    print_ptx_ = print_ptx;
    print_llvmir_ = print_llvmir;
    enable_ftz_ = enable_ftz;
  }

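  // Computes the device blob for the gpu.module being operated on and stores
  // it as a string attribute. On failure, the status message is emitted as an
  // error on the module and the pass is marked as failed.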
  void runOnOperation() override {
    mlir::gpu::GPUModuleOp gpu_module = getOperation();
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          mlir::StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

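  // Compiles the gpu.module for every architecture in `architectures_` and
  // returns the compiled images bundled into a single fatbin.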
  xla::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      mlir::gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return InternalError("Expected at least one GPU architecture.");
    }

    llvm::LLVMContext llvmContext;
    auto llvmModule = mlir::translateModuleToLLVMIR(gpu_module, llvmContext);

    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to LLVM IR");
    }

    llvmModule->setModuleIdentifier(gpu_module.getName());

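    // From here on, the backend-specific path compiles the LLVM module once
    // per requested architecture: to HSACO images under ROCm, or to cubin
    // (and optionally PTX) images under CUDA.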
#if TENSORFLOW_USE_ROCM
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return InternalError(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      xla::gpu::GpuVersion gpu_version{arch_str};
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(), gpu_version, config, libdevice_dir);
      if (!hsaco_or.ok()) {
        return InternalError("Failure when generating HSACO");
      }
      auto hsaco = hsaco_or.ValueOrDie();
      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
    llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    // Make sure we use full precision division operations.
    (*options.mutable_xla_backend_extra_options())["-nvptx-prec-divf32"] = "2";
    // Disable tail sinking as it interferes with load/store vectorization. If
    // we have common tails, that is intentional.
    (*options.mutable_xla_backend_extra_options())["-simplifycfg-sink-common"] =
        "false";

    config.set_debug_options(options);

    auto enable_fusion = [](llvm::TargetMachine* target) {
      target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
    };

    // Compile and collect requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
    for (const std::string& arch_str : architectures_) {
      // Parse CUDA architecture.
      absl::string_view consumable_arch(arch_str);
      bool is_compute_profile;
      if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
        is_compute_profile = true;
      } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
        is_compute_profile = false;
      } else {
        return InternalError(
            "Could not parse CUDA architecture prefix (expected sm_ or "
            "compute_)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse CUDA architecture number");
      }

      int cc_major = arch / 10;
      int cc_minor = arch % 10;
      // Module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(
              llvm_module_copy.get(),
              tensorflow::se::CudaComputeCapability{cc_major, cc_minor}, config,
              libdevice_dir, enable_fusion));

      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
                          tensorflow::se::CompileGpuAsm(
                              cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));

      // Collect cubin (and ptx image if requested).
      images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, gpu_asm_opts);
#endif

    return InternalError(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA is defined."
        " Did you specify either --config=rocm or --config=cuda?");
  }

 private:
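  // Locates the libdevice bitcode directory by probing each candidate CUDA
  // root (including the directory given by the xla_gpu_cuda_data_dir debug
  // option) for an nvvm/libdevice subdirectory.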
  xla::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return InternalError(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }
  bool enable_ftz_;
};

}  // namespace

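// Creates a GpuKernelToBlobPass with the given blob attribute name, target
// architectures, and debugging/precision options.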
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool print_ptx, bool print_llvmir, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, print_ptx, print_llvmir, enable_ftz);
}
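// Example of wiring this pass into a pass pipeline. This is only a sketch;
// the annotation name and architecture string below are illustrative
// placeholders, not values mandated by this file:
//
//   mlir::PassManager pm(&context);
//   pm.addNestedPass<mlir::gpu::GPUModuleOp>(CreateGpuKernelToBlobPass(
//       /*blob_annotation=*/"gpu.binary_blob", /*architectures=*/{"sm_70"},
//       /*print_ptx=*/false, /*print_llvmir=*/false, /*enable_ftz=*/false));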

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir