/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
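
// This pass embeds GPU device code into the enclosing `gpu.module`: it
// translates the module to LLVM IR, compiles it for each requested
// architecture (PTX/cubin on CUDA, HSACO on ROCm), optionally bundles the
// images into a fatbin, and attaches the result as a string attribute.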

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
#include "mlir/Target/LLVMIR.h"  // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h"  // from @llvm-project
#include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
#include "tensorflow/compiler/xla/service/gpu/target_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"

#if GOOGLE_CUDA
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "tensorflow/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
namespace kernel_gen {
namespace transforms {
namespace {

#define GEN_PASS_CLASSES
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc"

using xla::InternalError;

class GpuKernelToBlobPass
    : public GpuKernelToBlobPassBase<GpuKernelToBlobPass> {
 public:
  GpuKernelToBlobPass(mlir::StringRef blob_annotation,
                      llvm::ArrayRef<std::string> architectures,
                      bool generate_fatbin, bool print_ptx, bool enable_ftz) {
    if (!blob_annotation.empty()) {
      blob_annotation_ = blob_annotation.str();
    }
    architectures_ = architectures;
    generate_fatbin_ = generate_fatbin;
    print_ptx_ = print_ptx;
    enable_ftz_ = enable_ftz;
  }

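  // Compiles the GPU module and, on success, attaches the resulting blob to
  // the module under the configured annotation attribute.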
  void runOnOperation() override {
    mlir::gpu::GPUModuleOp gpu_module = getOperation();
    auto blob_or = GetGpuBinaryBlob(gpu_module);
    if (blob_or.ok()) {
      const auto& blob = blob_or.ValueOrDie();
      std::string blob_string(blob.begin(), blob.end());
      gpu_module->setAttr(blob_annotation_,
                          mlir::StringAttr::get(&getContext(), blob_string));
      return;
    }
    // Forward the error by attaching the message to the gpu module.
    gpu_module.emitError(blob_or.status().error_message());
    return signalPassFailure();
  }

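  // Translates the GPU module to LLVM IR and compiles it to device machine
  // code for each requested architecture. With `generate_fatbin_`, the
  // per-architecture images are bundled into a single fatbin.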
  xla::StatusOr<std::vector<uint8_t>> GetGpuBinaryBlob(
      mlir::gpu::GPUModuleOp gpu_module) {
    if (architectures_.empty()) {
      return InternalError("Expected at least one GPU architecture.");
    }
    if (!generate_fatbin_ && architectures_.size() > 1) {
      return InternalError(
          "Can only generate machine code for more than one architecture as a "
          "fatbin.");
    }

    llvm::LLVMContext llvmContext;
    auto llvmModule = mlir::translateModuleToLLVMIR(gpu_module, llvmContext);

#if TENSORFLOW_USE_ROCM
    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to ROCDL IR");
    }

    llvmModule->setModuleIdentifier("acme");

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    config.set_debug_options(options);

    using AmdGpuHsaco = std::vector<tensorflow::uint8>;
    std::vector<tensorflow::se::HsacoImage> images;
    for (const std::string& arch_str : architectures_) {
      // Parse ROCm architecture.
      absl::string_view consumable_arch(arch_str);
      if (!absl::ConsumePrefix(&consumable_arch, "gfx")) {
        return InternalError(
            "Could not parse ROCm architecture prefix (expected gfx)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse ROCm architecture number");
      }

      std::string libdevice_dir = tensorflow::RocdlRoot();
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      xla::gpu::GpuVersion gpu_version{std::make_pair(arch, arch_str)};
      auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco(
          llvm_module_copy.get(), gpu_version, config, libdevice_dir);
      if (!hsaco_or.ok()) {
        return InternalError("Failure when generating HSACO");
      }

      auto hsaco = hsaco_or.ValueOrDie();
      if (!generate_fatbin_) {
        // Skip fatbin generation and return the first and only GPU machine
        // code. This is currently only used for `tf_to_gpu_binary` and will
        // eventually disappear.
        return hsaco;
      }

      images.push_back({arch_str, std::move(hsaco)});
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle HSACO images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images, tensorflow::RocmRoot());

#elif GOOGLE_CUDA
    if (!llvmModule) {
      return InternalError("Could not translate MLIR module to NVVM");
    }

    llvmModule->setModuleIdentifier("acme");
    llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout);

    xla::HloModuleConfig config;
    xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
    options.set_xla_gpu_ftz(enable_ftz_);
    config.set_debug_options(options);

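    // Allow the backend to fuse floating-point multiply/add pairs (e.g. into
    // fma instructions) when lowering to PTX.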
    auto enable_fusion = [](llvm::TargetMachine* target) {
      target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
    };

    // Compile and collect requested cubin and PTX images.
    std::vector<tensorflow::se::CubinOrPTXImage> images;
    TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
    auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
    for (const std::string& arch_str : architectures_) {
      // Parse CUDA architecture.
      absl::string_view consumable_arch(arch_str);
      bool is_compute_profile;
      if (absl::ConsumePrefix(&consumable_arch, "compute_")) {
        is_compute_profile = true;
      } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) {
        is_compute_profile = false;
      } else {
        return InternalError(
            "Could not parse cuda architecture prefix (expected sm_ or "
            "compute_)");
      }
      uint32_t arch;
      if (!absl::SimpleAtoi(consumable_arch, &arch)) {
        return InternalError("Could not parse cuda architecture number");
      }

      uint32_t cc_major = arch / 10;
      uint32_t cc_minor = arch % 10;
      // Module may be changed by CompileToPtx.
      auto llvm_module_copy = llvm::CloneModule(*llvmModule);
      TF_ASSIGN_OR_RETURN(
          std::string ptx,
          xla::gpu::nvptx::CompileToPtx(llvm_module_copy.get(),
                                        std::make_pair(cc_major, cc_minor),
                                        config, libdevice_dir, enable_fusion));

      if (print_ptx_) {
        llvm::dbgs() << "Generated PTX code for module '"
                     << gpu_module.getName() << "' on architecture sm_" << arch
                     << ":\n";
        llvm::dbgs() << ptx << "\n";
      }

      TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
                          tensorflow::se::CompileGpuAsm(
                              cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));

      if (!generate_fatbin_) {
        // Skip fatbin generation and return the first and only GPU machine
        // code. This is currently only used for `tf_to_gpu_binary` and will
        // eventually disappear.
        return gpu_asm;
      }

      // Collect cubin (and ptx image if requested).
      images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
      if (is_compute_profile) {
        std::vector<uint8_t> ptx_bytes;
        std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
        images.push_back(
            {absl::StrCat("compute_", arch), std::move(ptx_bytes)});
      }
    }

    // TODO(b/169870789): Revisit the use of fatbins.
    // Bundle cubin and PTX images into a single fatbin.
    return tensorflow::se::BundleGpuAsm(images,
                                        gpu_asm_opts.preferred_cuda_dir);
#endif

    return InternalError(
        "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
        " Did you specify either --config=rocm or --config=cuda ?");
  }

 private:
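  // Searches the candidate CUDA installation roots for the libdevice bitcode
  // directory (<cuda_root>/nvvm/libdevice).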
  xla::StatusOr<std::string> GetLibdeviceDir(
      const xla::HloModuleConfig& hlo_module_config) {
    for (const std::string& cuda_root : tensorflow::CandidateCudaRoots(
             hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) {
      std::string libdevice_dir =
          tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
      VLOG(2) << "Looking for libdevice at " << libdevice_dir;
      if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
        VLOG(2) << "Found libdevice dir " << libdevice_dir;
        return libdevice_dir;
      }
    }
    return InternalError(
        "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice");
  }

  bool enable_ftz_;
};

}  // namespace

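// Creates a GpuKernelToBlobPass configured with the given annotation name,
// target architectures, and compilation options.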
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
    mlir::StringRef blob_annotation, ArrayRef<std::string> architectures,
    bool generate_fatbin, bool print_ptx, bool enable_ftz) {
  return std::make_unique<GpuKernelToBlobPass>(
      blob_annotation, architectures, generate_fatbin, print_ptx, enable_ftz);
}

}  // namespace transforms
}  // namespace kernel_gen
}  // namespace mlir