/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

using xla::InternalError;

class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures, bool print_ptx,
                      bool print_llvmir, bool enable_ftz) {
    if (!blob_annotation.empty()) {
      blob_annotation_ = blob_annotation.str();
    }
    architectures_ = architectures;
    print_ptx_ = print_ptx;
    print_llvmir_ = print_llvmir;
    enable_ftz_ = enable_ftz;
  }

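  // Runs on each gpu.module: compiles it to a device blob and, on success,
  // attaches the blob as a string attribute under `blob_annotation_`;
  // otherwise emits the error on the module and signals pass failure.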
  void runOnOperation() override {
    mlir::gpu::GPUModuleOp gpu_module = getOperation();
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          mlir::StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

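  // Translates the GPU module to LLVM IR and compiles it into a device
  // binary: an HSACO fatbin on ROCm builds, a cubin/PTX fatbin on CUDA
  // builds.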
  xla::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      mlir::gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return InternalError("Expected at least one GPU architecture.");
    }

    llvm::LLVMContext llvmContext;
    auto llvmModule = mlir::translateModuleToLLVMIR(gpu_module, llvmContext);

    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to LLVM IR");
    }

    llvmModule->setModuleIdentifier(gpu_module.getName());

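    // Target-specific compilation: each requested architecture gets its own
    // clone of the LLVM module, and the per-architecture images are bundled
    // into a single fatbin below.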
#if TENSORFLOW_USE_ROCM
    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return InternalError(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      xla::gpu::GpuVersion gpu_version{arch_str};
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(), gpu_version, config, libdevice_dir);
      if (!hsaco_or.ok()) {
        return InternalError("Failure when generating HSACO");
      }
      auto hsaco = hsaco_or.ValueOrDie();
      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
    llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    options.set_xla_gpu_dump_llvmir(print_llvmir_);
    // Make sure we use full precision division operations.
    (*options.mutable_xla_backend_extra_options())["-nvptx-prec-divf32"] = "2";
    // Disable tail sinking as it interferes with load/store vectorization. If
    // we have common tails, that is intentional.
    (*options.mutable_xla_backend_extra_options())["-simplifycfg-sink-common"] =
        "false";

    config.set_debug_options(options);

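    // Allow the NVPTX backend to form fused floating-point operations (FMA)
    // when it sets up the target machine.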
    auto enable_fusion = [](llvm::TargetMachine* target) {
      target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
    };

    // Compile and collect requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
    for (const std::string& arch_str : architectures_) {
      // Parse CUDA architecture.
      absl::string_view consumable_arch(arch_str);
      bool is_compute_profile;
      if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
        is_compute_profile = true;
      } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
        is_compute_profile = false;
      } else {
        return InternalError(
            "Could not parse CUDA architecture prefix (expected sm_ or "
            "compute_)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse CUDA architecture number");
      }

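      // The architecture number encodes the compute capability as
      // major * 10 + minor, e.g. "sm_75" gives cc_major = 7, cc_minor = 5.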
      int cc_major = arch / 10;
      int cc_minor = arch % 10;
      // Module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(
              llvm_module_copy.get(),
              tensorflow::se::CudaComputeCapability{cc_major, cc_minor}, config,
              libdevice_dir, enable_fusion));

      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
                          tensorflow::se::CompileGpuAsm(
                              cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));

      // Collect cubin (and ptx image if requested).
      images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, gpu_asm_opts);
#endif

    return InternalError(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA is defined."
        " Did you specify either --config=rocm or --config=cuda?");
  }

 private:
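  // Searches the candidate CUDA roots for an nvvm/libdevice directory and
  // returns the first one that exists on disk.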
  xla::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return InternalError(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }
  bool enable_ftz_;
};

}  // namespace

std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool print_ptx, bool print_llvmir, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, print_ptx, print_llvmir, enable_ftz);
}

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir