1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
17 
18 #include <fstream>
19 #include <map>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 
24 #include "absl/base/call_once.h"
25 #include "absl/memory/memory.h"
26 #include "absl/strings/str_cat.h"
27 #include "absl/strings/string_view.h"
28 #include "llvm/ADT/STLExtras.h"
29 #include "llvm/ADT/StringMap.h"
30 #include "llvm/ADT/StringSet.h"
31 #include "llvm/Analysis/TargetLibraryInfo.h"
32 #include "llvm/Analysis/TargetTransformInfo.h"
33 #include "llvm/Bitcode/BitcodeReader.h"
34 #include "llvm/Bitcode/BitcodeWriter.h"
35 #include "llvm/CodeGen/CommandFlags.h"
36 #include "llvm/IR/LLVMContext.h"
37 #include "llvm/IR/LegacyPassManager.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/Verifier.h"
40 #include "llvm/InitializePasses.h"
41 #include "llvm/Linker/Linker.h"
42 #include "llvm/PassRegistry.h"
43 #include "llvm/Support/CommandLine.h"
44 #include "llvm/Support/FileSystem.h"
45 #include "llvm/Support/FormattedStream.h"
46 #include "llvm/Support/Program.h"
47 #include "llvm/Support/TargetRegistry.h"
48 #include "llvm/Support/TargetSelect.h"
49 #include "llvm/Support/ToolOutputFile.h"
50 #include "llvm/Target/TargetMachine.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/IPO/AlwaysInliner.h"
53 #include "llvm/Transforms/IPO/Internalize.h"
54 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
55 #include "llvm/Transforms/Scalar.h"
56 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
57 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
58 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
59 #include "tensorflow/compiler/xla/status_macros.h"
60 #include "tensorflow/compiler/xla/types.h"
61 #include "tensorflow/compiler/xla/util.h"
62 #include "tensorflow/core/lib/io/path.h"
63 #include "tensorflow/core/platform/env.h"
64 #include "tensorflow/core/platform/logging.h"
65 #include "tensorflow/core/platform/path.h"
66 #include "tensorflow/core/platform/random.h"
67 #include "tensorflow/core/platform/tracing.h"
68 #include "tensorflow/core/profiler/lib/traceme.h"
69 #include "tensorflow/core/util/env_var.h"
70 
71 #if !defined(PLATFORM_GOOGLE) && TENSORFLOW_USE_ROCM
72 #include "rocm/rocm_config.h"
73 #endif
74 
75 namespace xla {
76 namespace gpu {
77 namespace {
78 
// File-local instance of LLVM's RegisterCodeGenFlags. NOTE(review): presumably
// its construction registers the codegen command-line flags that the
// llvm::codegen:: helpers used later in this file read -- confirm against the
// LLVM CommandFlags documentation.
static llvm::codegen::RegisterCodeGenFlags CGF;
80 
// Inliner threshold used when compiling through the LLVM AMDGPU backend.
constexpr int kAMDGPUInlineThreshold = 0x100000;

// Inliner threshold used with LLVM by default.
constexpr int kDefaultInlineThreshold = 1100;
86 
87 // Gets the GPU name as it's known to LLVM for a given compute
88 // capability.  If we see an unrecognized compute capability, we
89 // return the highest one that is known and below the selected device.
GetSmName(se::CudaComputeCapability compute_capability)90 static string GetSmName(se::CudaComputeCapability compute_capability) {
91   int compute_capability_version =
92       compute_capability.major * 10 + compute_capability.minor;
93   int sm_version = 30;
94   // If the current compute capability isn't known, fallback to the
95   // most recent version before it.
96   int supported_versions[] = {75, 72, 70, 62, 61, 60, 53,
97                               52, 50, 37, 35, 32, 30};
98   for (int v : supported_versions) {
99     if (v <= compute_capability_version) {
100       sm_version = v;
101       break;
102     }
103   }
104 
105   // If the current CC isn't supported by LLVM and it is newer then
106   // the max supported LLVM version, do not warn about it. The end
107   // user can't do anything about this. PTX compiled for SM75 will
108   // run on SM80 too.
109   if (sm_version != compute_capability_version &&
110       compute_capability_version < supported_versions[0]) {
111     LOG(WARNING) << "Unknown compute capability "
112                  << compute_capability.ToString()
113                  << ". Defaulting to telling LLVM that we're compiling for sm_"
114                  << sm_version;
115   }
116   return absl::StrCat("sm_", sm_version);
117 }
118 
119 // Convenience function for producing a name of a temporary compilation product
120 // from the input filename.
MakeNameForTempProduct(absl::string_view input_filename,absl::string_view extension)121 string MakeNameForTempProduct(absl::string_view input_filename,
122                               absl::string_view extension) {
123   return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
124                                   extension);
125 }
126 
127 // Initializes LLVM passes. Uses the PassRegistry mechanism.
InitializePasses(llvm::PassRegistry * pass_registry)128 void InitializePasses(llvm::PassRegistry* pass_registry) {
129   llvm::initializeCore(*pass_registry);
130   llvm::initializeCodeGen(*pass_registry);
131   llvm::initializeScalarOpts(*pass_registry);
132   llvm::initializeObjCARCOpts(*pass_registry);
133   llvm::initializeVectorization(*pass_registry);
134   llvm::initializeIPO(*pass_registry);
135   llvm::initializeAnalysis(*pass_registry);
136   llvm::initializeTransformUtils(*pass_registry);
137   llvm::initializeInstCombine(*pass_registry);
138   llvm::initializeInstrumentation(*pass_registry);
139   llvm::initializeTarget(*pass_registry);
140   llvm::initializeCodeGenPreparePass(*pass_registry);
141 }
142 
143 // Returns the TargetMachine, given a triple.
GetTargetMachine(llvm::Triple triple,absl::string_view cpu_name,const HloModuleConfig & hlo_module_config,absl::string_view feature_str)144 std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
145     llvm::Triple triple, absl::string_view cpu_name,
146     const HloModuleConfig& hlo_module_config, absl::string_view feature_str) {
147   std::string error;
148   const llvm::Target* target =
149       llvm::TargetRegistry::lookupTarget("", triple, error);
150   if (target == nullptr) {
151     LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
152                << " -- " << error;
153     return nullptr;
154   }
155 
156   llvm::TargetOptions target_options =
157       llvm::codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple());
158 
159   // Set the verbose assembly options.
160   target_options.MCOptions.AsmVerbose = false;
161 
162   // The selection of codegen optimization level is copied from function
163   // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
164   llvm::CodeGenOpt::Level codegen_opt_level;
165   switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
166     case 1:
167       codegen_opt_level = llvm::CodeGenOpt::Less;
168       break;
169     case 2:
170       codegen_opt_level = llvm::CodeGenOpt::Default;
171       break;
172     case 3:
173       codegen_opt_level = llvm::CodeGenOpt::Aggressive;
174       break;
175     default:
176       codegen_opt_level = llvm::CodeGenOpt::None;
177   }
178   return absl::WrapUnique(target->createTargetMachine(
179       triple.str(), llvm_ir::AsStringRef(cpu_name),
180       llvm_ir::AsStringRef(feature_str), target_options,
181       llvm::codegen::getExplicitRelocModel(),
182       llvm::codegen::getExplicitCodeModel(), codegen_opt_level));
183 }
184 
185 // Adds the standard LLVM optimization passes, based on the speed optimization
186 // level (opt_level) and size optimization level (size_level). Both module
187 // and function-level passes are added, so two pass managers are passed in and
188 // modified by this function.
AddOptimizationPasses(unsigned opt_level,unsigned size_level,llvm::TargetMachine * target_machine,llvm::legacy::PassManagerBase * module_passes,llvm::legacy::FunctionPassManager * function_passes,int inline_threshold)189 void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
190                            llvm::TargetMachine* target_machine,
191                            llvm::legacy::PassManagerBase* module_passes,
192                            llvm::legacy::FunctionPassManager* function_passes,
193                            int inline_threshold) {
194   llvm::PassManagerBuilder builder;
195   builder.OptLevel = opt_level;
196   builder.SizeLevel = size_level;
197 
198   if (opt_level > 1) {
199     builder.Inliner = llvm::createFunctionInliningPass(inline_threshold);
200   } else {
201     // Only inline functions marked with "alwaysinline".
202     builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
203   }
204 
205   builder.DisableUnrollLoops = opt_level == 0;
206   builder.LoopVectorize = opt_level > 0;
207   builder.SLPVectorize = opt_level > 1 && size_level < 2;
208 
209   // NVPTX's early-as-possible passes include NVVM reflect.
210   target_machine->adjustPassManager(builder);
211 
212   builder.populateFunctionPassManager(*function_passes);
213   builder.populateModulePassManager(*module_passes);
214 }
215 
216 // Emits the given module to a bit code file.
EmitBitcodeToFile(const llvm::Module & module,absl::string_view filename)217 void EmitBitcodeToFile(const llvm::Module& module, absl::string_view filename) {
218   std::error_code error_code;
219   llvm::ToolOutputFile outfile(string(filename).c_str(), error_code,
220                                llvm::sys::fs::OF_None);
221   if (error_code) {
222     LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
223   }
224 
225   llvm::WriteBitcodeToFile(module, outfile.os());
226   outfile.keep();
227 }
228 
229 // Emits the given module to PTX. target_machine is an initialized TargetMachine
230 // for the NVPTX target.
EmitModuleToPTX(llvm::Module * module,llvm::TargetMachine * target_machine)231 string EmitModuleToPTX(llvm::Module* module,
232                        llvm::TargetMachine* target_machine) {
233   std::string ptx;
234   {
235     llvm::raw_string_ostream stream(ptx);
236     llvm::buffer_ostream pstream(stream);
237     // The extension is stripped by IrDumpingPassManager, so we need to
238     // get creative to add a suffix.
239     IrDumpingPassManager codegen_passes(
240         MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
241         "", false);
242     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
243         llvm::Triple(module->getTargetTriple())));
244 
245     target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
246                                         llvm::CGFT_AssemblyFile);
247     codegen_passes.run(*module);
248   }
249 
250   return ptx;
251 }
252 
253 // LLVM has an extensive flags mechanism of its own, which is only accessible
254 // through the command line. Internal libraries within LLVM register parsers for
255 // flags, with no other way to configure them except pass these flags.
256 // To do this programmatically, we invoke ParseCommandLineOptions manually with
257 // a "fake argv".
258 // Note: setting flags with this method is stateful, since flags are just
259 // static globals within LLVM libraries.
FeedLLVMWithFlags(const std::vector<string> & cl_opts)260 void FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
261   std::vector<const char*> fake_argv = {""};
262   for (const string& cl_opt : cl_opts) {
263     fake_argv.push_back(cl_opt.c_str());
264   }
265   llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
266 }
267 
268 // Returns whether the module could use any device bitcode library functions.
CouldNeedDeviceBitcode(const llvm::Module & module)269 bool CouldNeedDeviceBitcode(const llvm::Module& module) {
270   for (const llvm::Function& function : module.functions()) {
271     // The list of prefixes should be in sync with library functions used in
272     // target_util.cc.
273     if (!function.isIntrinsic() && function.isDeclaration() &&
274         (function.getName().startswith("__nv_") ||
275          function.getName().startswith("__ocml_") ||
276          function.getName().startswith("__ockl_"))) {
277       return true;
278     }
279   }
280   return false;
281 }
282 
283 // Links the module with a vector of path to bitcode modules.
284 // The caller must guarantee that the paths exist.
LinkWithBitcodeVector(llvm::Module * module,const std::vector<string> & bitcode_path_vector)285 Status LinkWithBitcodeVector(llvm::Module* module,
286                              const std::vector<string>& bitcode_path_vector) {
287   llvm::Linker linker(*module);
288 
289   for (auto& bitcode_path : bitcode_path_vector) {
290     if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) {
291       LOG(ERROR) << "bitcode module is required by this HLO module but was "
292                     "not found at "
293                  << bitcode_path;
294       return xla::InternalError("bitcode module not found at %s", bitcode_path);
295     }
296 
297     std::unique_ptr<llvm::Module> bitcode_module =
298         LoadIRModule(bitcode_path, &module->getContext());
299     // Ignore the data layout of the module we're importing. This avoids a
300     // warning from the linker.
301     bitcode_module->setDataLayout(module->getDataLayout());
302     if (linker.linkInModule(
303             std::move(bitcode_module), llvm::Linker::Flags::LinkOnlyNeeded,
304             [](llvm::Module& M, const llvm::StringSet<>& GVS) {
305               internalizeModule(M, [&GVS](const llvm::GlobalValue& GV) {
306                 return !GV.hasName() || (GVS.count(GV.getName()) == 0);
307               });
308             })) {
309       return xla::InternalError("Error linking bitcode module from %s",
310                                 bitcode_path);
311     }
312   }
313   return Status::OK();
314 }
315 
316 // Links libdevice into the given module if the module needs libdevice.
LinkLibdeviceIfNecessary(llvm::Module * module,const string & libdevice_dir_path)317 Status LinkLibdeviceIfNecessary(llvm::Module* module,
318                                 const string& libdevice_dir_path) {
319   if (!CouldNeedDeviceBitcode(*module)) {
320     return Status::OK();
321   }
322 
323   // CUDA 9+ uses a single libdevice file for all devices, and we don't support
324   // older CUDAs.
325   string libdevice_path =
326       tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc");
327   if (!tensorflow::Env::Default()->FileExists(libdevice_path).ok()) {
328     LOG(WARNING)
329         << "libdevice is required by this HLO module but was not found at "
330         << libdevice_path;
331     return xla::InternalError("libdevice not found at %s", libdevice_path);
332   }
333 
334   VLOG(1) << "Linking with libdevice from: " << libdevice_path;
335   return LinkWithBitcodeVector(module, {libdevice_path});
336 }
337 
NVPTXTargetModuleLinker(llvm::Module * module,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config,const string & device_bitcode_dir_path)338 Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
339                                const HloModuleConfig& hlo_module_config,
340                                const string& device_bitcode_dir_path) {
341   // Link the input module with libdevice, to pull in implementations of some
342   // builtins.
343   TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, device_bitcode_dir_path));
344 
345   // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
346   // can access it.
347   module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
348                         hlo_module_config.debug_options().xla_gpu_ftz());
349 
350   // If ftz is enabled, set it as an attribute on every function in the module.
351   if (hlo_module_config.debug_options().xla_gpu_ftz()) {
352     for (llvm::Function& fn : *module) {
353       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
354     }
355   }
356 
357   return Status::OK();
358 }
359 
NVPTXGetTargetMachine(llvm::Triple target_triple,se::CudaComputeCapability compute_capability,const HloModuleConfig & hlo_module_config)360 std::unique_ptr<llvm::TargetMachine> NVPTXGetTargetMachine(
361     llvm::Triple target_triple, se::CudaComputeCapability compute_capability,
362     const HloModuleConfig& hlo_module_config) {
363   // Figure out the exact name of the processor as known to the NVPTX backend
364   // from the gpu_architecture flag.
365   return GetTargetMachine(target_triple, GetSmName(compute_capability),
366                           hlo_module_config, "+ptx60");
367 }
368 
// Signature of the target-specific linking step that LinkAndOptimizeModule
// invokes before optimizing. It receives the module, the GPU version, the
// module config and the device bitcode directory path, and returns a Status.
using TargetModuleLinker = std::function<Status(
    llvm::Module*, GpuVersion, const HloModuleConfig&, const string&)>;
371 
LinkAndOptimizeModule(llvm::Module * module,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config,const string & device_bitcode_dir_path,TargetModuleLinker module_linker,llvm::Triple default_target_triple,llvm::TargetMachine * target_machine,int inline_threshold)372 Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version,
373                              const HloModuleConfig& hlo_module_config,
374                              const string& device_bitcode_dir_path,
375                              TargetModuleLinker module_linker,
376                              llvm::Triple default_target_triple,
377                              llvm::TargetMachine* target_machine,
378                              int inline_threshold) {
379   TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config,
380                                    device_bitcode_dir_path));
381 
382   bool dump_ir = hlo_module_config.debug_options().xla_gpu_dump_llvmir();
383   std::string outputs_dir;
384   tensorflow::io::GetTestUndeclaredOutputsDir(&outputs_dir);
385   IrDumpingPassManager module_passes(module->getModuleIdentifier(), outputs_dir,
386                                      dump_ir);
387 
388   // Add an appropriate TargetLibraryInfo pass for the module's triple.
389   llvm::TargetLibraryInfoWrapperPass* tliwp =
390       new llvm::TargetLibraryInfoWrapperPass(
391           llvm::Triple(module->getTargetTriple()));
392   module_passes.add(tliwp);
393 
394   // Try to fetch the target triple from the module. If not present, set a
395   // default target triple.
396   llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
397   if (target_triple.getArch() == llvm::Triple::UnknownArch) {
398     LOG(WARNING) << "target triple not found in the module";
399     target_triple = default_target_triple;
400   }
401 
402   module_passes.add(llvm::createTargetTransformInfoWrapperPass(
403       target_machine->getTargetIRAnalysis()));
404 
405   // The LLVM IR verifier performs sanity checking on the IR. This helps
406   // discover problems and report them in a meaningful manner, rather than let
407   // later passes report obscure assertions because of unfulfilled invariants.
408   module_passes.add(llvm::createVerifierPass());
409 
410   // Create the function-level pass manager. It needs data layout information
411   // too.
412   llvm::legacy::FunctionPassManager function_passes(module);
413 
414   int32_t opt_level =
415       hlo_module_config.debug_options().xla_backend_optimization_level();
416 
417   if (opt_level < 2) {
418     LOG(ERROR) << std::string(80, '*');
419     LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
420                   "generation but ";
421     LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
422                << "!";
423     LOG(ERROR) << "(Supported configuration is "
424                   "--xla_backend_optimization_level >= 2.)";
425     LOG(ERROR) << std::string(80, '*');
426   }
427 
428   // Add optimization passes, and set inliner threshold.
429   AddOptimizationPasses(opt_level,
430                         /*size_level=*/0, target_machine, &module_passes,
431                         &function_passes, inline_threshold);
432 
433   // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
434   // again after the standard optimization passes [http://b/13329423].
435   // TODO(jingyue): SROA may further expose more optimization opportunities such
436   // as more precise alias analysis and more function inlining (SROA may change
437   // the inlining cost of a function). For now, running SROA already emits good
438   // enough code for the evaluated benchmarks. We may want to run more
439   // optimizations later.
440   if (opt_level > 0) {
441     // LLVM's optimizer turns on SROA when the optimization level is greater
442     // than 0. We mimic this behavior here.
443     module_passes.add(llvm::createSROAPass());
444   }
445 
446   // Verify that the module is well formed after optimizations ran.
447   module_passes.add(llvm::createVerifierPass());
448 
449   // Done populating the pass managers. Now run them.
450 
451   function_passes.doInitialization();
452   for (auto func = module->begin(); func != module->end(); ++func) {
453     function_passes.run(*func);
454   }
455   function_passes.doFinalization();
456   module_passes.run(*module);
457 
458   return Status::OK();
459 }
460 
461 // One-time module initializer.
462 // Must be called only once -- DO NOT CALL DIRECTLY.
NVPTXBackendInit(const HloModuleConfig & hlo_module_config)463 void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) {
464   // Feed all customized flags here, so we can override them with llvm_cl_opts
465   // without redeploy the compiler for development purpose.
466 
467   // This flag tunes a threshold in branch folding. The default threshold, which
468   // is one, is not suitable for CUDA programs where branches are more expensive
469   // than for CPU programs. Setting the threshold to 2 improves the latency of
470   // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
471   // latency of other benchmarks so far.
472   //
473   // I also tried setting this threshold to other values:
474   // * 3-6 gives similar results as 2;
475   // * >6 start hurting the performance of at least dot product kernels.
476   //
477   // TODO(jingyue): The current threshold only considers the number of IR
478   // instructions which do not accurately reflect the true cost. We need a
479   // better cost model.
480   FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
481   // Increase limit when scanning memory dependencies.  This helps to reduce
482   // more redundant load instructions.
483   //
484   // The specific value is currently large enough for s3d in shoc benchmark,
485   // which contains a lot of load instructions and many arithmetic instructions
486   // between those loads.
487   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
488 
489   // Use div.full -- it matters for some float-division heavy benchmarks.
490   // Using div.approx produces incorrect result for float32(max)/float32(max).
491   FeedLLVMWithFlags({"-nvptx-prec-divf32=1"});
492 
493   llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
494 
495   // Initialize the NVPTX target; it's the only target we link with, so call its
496   // specific initialization functions instead of the catch-all InitializeAll*.
497   LLVMInitializeNVPTXTarget();
498   LLVMInitializeNVPTXTargetInfo();
499   LLVMInitializeNVPTXTargetMC();
500   LLVMInitializeNVPTXAsmPrinter();
501 
502   // Initialize the LLVM optimization passes.
503   llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
504   InitializePasses(registry);
505 }
506 
507 }  // namespace
508 
509 namespace nvptx {
510 
CompileToPtx(llvm::Module * module,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config,const string & libdevice_dir_path,std::function<void (llvm::TargetMachine *)> configure_target)511 StatusOr<string> CompileToPtx(
512     llvm::Module* module, GpuVersion gpu_version,
513     const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path,
514     std::function<void(llvm::TargetMachine*)> configure_target) {
515   static absl::once_flag backend_init_flag;
516   absl::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config);
517 
518   string ptx;
519   std::unique_ptr<llvm::TargetMachine> target_machine;
520   {
521     tensorflow::profiler::TraceMe activity(
522         [&] { return absl::StrCat("Compiling IR:", module->getName().str()); },
523         tensorflow::profiler::TraceMeLevel::kInfo);
524     XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
525 
526     // If the module has no functions or globals, there's nothing to compile.
527     // Just return an empty string.
528     if (module->empty() && module->global_empty()) {
529       VLOG(2) << "Module '" << module->getName().str()
530               << "' is empty. Skipping compilation.";
531       return string();
532     }
533 
534     auto compute_capability =
535         absl::get_if<se::CudaComputeCapability>(&gpu_version);
536     if (!compute_capability) {
537       return xla::InternalError(
538           "Incompatible compute capability was specified.");
539     }
540 
541     llvm::Triple default_target_triple("nvptx64-unknown-unknown");
542     // Construct LLVM TargetMachine for NVPTX.
543     std::unique_ptr<llvm::TargetMachine> target_machine = NVPTXGetTargetMachine(
544         default_target_triple, *compute_capability, hlo_module_config);
545 
546     // Apply target machine configuration from call-back if available.
547     if (configure_target) {
548       configure_target(target_machine.get());
549     }
550 
551     // Link with libdevice, and optimize the LLVM module.
552     TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
553         module, gpu_version, hlo_module_config, libdevice_dir_path,
554         NVPTXTargetModuleLinker, default_target_triple, target_machine.get(),
555         kDefaultInlineThreshold));
556 
557     // Lower optimized LLVM module to PTX.
558     ptx = EmitModuleToPTX(module, target_machine.get());
559   }
560   return ptx;
561 }
562 
563 }  // namespace nvptx
564 
565 namespace {
566 
567 // Gets the ROCm-Device-Libs filenames for a particular AMDGPU version.
GetROCDLPaths(std::string amdgpu_version,const string & rocdl_dir_path)568 std::vector<string> GetROCDLPaths(std::string amdgpu_version,
569                                   const string& rocdl_dir_path) {
570   // AMDGPU version-neutral bitcodes.
571 #if TF_ROCM_VERSION >= 30900
572   static std::vector<string>* rocdl_filenames = new std::vector<string>(
573       {"hc.bc", "opencl.bc", "ocml.bc", "ockl.bc", "oclc_finite_only_off.bc",
574        "oclc_daz_opt_off.bc", "oclc_correctly_rounded_sqrt_on.bc",
575        "oclc_unsafe_math_off.bc", "oclc_wavefrontsize64_on.bc"});
576 #else
577   static std::vector<string>* rocdl_filenames = new std::vector<string>(
578       {"hc.amdgcn.bc", "opencl.amdgcn.bc", "ocml.amdgcn.bc", "ockl.amdgcn.bc",
579        "oclc_finite_only_off.amdgcn.bc", "oclc_daz_opt_off.amdgcn.bc",
580        "oclc_correctly_rounded_sqrt_on.amdgcn.bc",
581        "oclc_unsafe_math_off.amdgcn.bc", "oclc_wavefrontsize64_on.amdgcn.bc"});
582 #endif
583 
584   // Construct full path to ROCDL bitcode libraries.
585   std::vector<string> result;
586   for (auto& filename : *rocdl_filenames) {
587     result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename));
588   }
589 
590   // Add AMDGPU version-specific bitcodes.
591   std::vector<std::string> tokens = absl::StrSplit(amdgpu_version, ':');
592   if (!tokens.empty() && tokens[0].size() >= 3) {
593     amdgpu_version = tokens[0].substr(3);
594   }
595   result.push_back(tensorflow::io::JoinPath(
596       rocdl_dir_path,
597 #if TF_ROCM_VERSION >= 30900
598       absl::StrCat("oclc_isa_version_", amdgpu_version, ".bc")));
599 #else
600       absl::StrCat("oclc_isa_version_", amdgpu_version, ".amdgcn.bc")));
601 #endif
602   return result;
603 }
604 
// One record of the HSACO cache: a compiled binary together with the key it
// was stored under.
struct HsacoCacheEntry {
  // Hash of `ir`; compared first during lookup, before the strings.
  uint64 hash;
  // IR text the binary was produced from (part of the lookup key).
  std::string ir;
  // GPU architecture string the binary was produced for (part of the key).
  std::string gfx;
  // The cached HSACO bytes.
  std::vector<uint8> hsaco;
};
611 
// In-memory cache of compiled HSACO binaries, keyed by IR text and GPU
// architecture string. The static Find/Add functions operate on the single
// file-level instance and take `m_mutex` before touching any state.
struct HsacoCache {
 protected:
  // Cached entries; searched linearly by Find.
  std::vector<HsacoCacheEntry> cache;
  // Guards `cache`, `request_count` and `hit_count`.
  std::mutex m_mutex;
  // Number of Find calls so far.
  int request_count = 0;
  // Number of Find calls that found a matching entry.
  int hit_count = 0;

 public:
  // Looks up (ir, gfx). Always writes the hash of `ir` to `hash`; on a hit
  // also copies the cached binary into `hsaco` and returns true.
  static bool Find(const std::string& ir, uint64_t& hash,
                   const std::string& gfx, std::vector<uint8>& hsaco);
  // Appends a new entry holding the given key, hash and binary.
  static void Add(const std::string& ir, uint64_t hash, const std::string& gfx,
                  const std::vector<uint8>& hsaco);
};
625 
// The single cache instance that HsacoCache::Find and HsacoCache::Add use.
static HsacoCache g_hsacoCache;
627 
Find(const std::string & ir,uint64_t & hash,const std::string & gfx,std::vector<uint8> & hsaco)628 bool HsacoCache::Find(const std::string& ir, uint64_t& hash,
629                       const std::string& gfx, std::vector<uint8>& hsaco) {
630   std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
631   hash = std::hash<std::string>{}(ir);
632   bool hit = false;
633   for (auto& x : g_hsacoCache.cache) {
634     if (x.hash != hash) continue;
635     if (x.gfx != gfx) continue;
636     if (x.ir != ir) continue;
637     hsaco = x.hsaco;
638     hit = true;
639     break;
640   }
641   g_hsacoCache.request_count++;
642   if (hit) g_hsacoCache.hit_count++;
643   if (!(g_hsacoCache.request_count % 50))
644     VLOG(1) << "HSACO cache: " << g_hsacoCache.request_count << " requests, "
645             << g_hsacoCache.hit_count << " hits";
646   return hit;
647 }
648 
Add(const std::string & ir,uint64_t hash,const std::string & gfx,const std::vector<uint8> & hsaco)649 void HsacoCache::Add(const std::string& ir, uint64_t hash,
650                      const std::string& gfx, const std::vector<uint8>& hsaco) {
651   std::lock_guard<std::mutex> lg(g_hsacoCache.m_mutex);
652   g_hsacoCache.cache.resize(g_hsacoCache.cache.size() + 1);
653   g_hsacoCache.cache.back().ir = ir;
654   g_hsacoCache.cache.back().hash = hash;
655   g_hsacoCache.cache.back().gfx = gfx;
656   g_hsacoCache.cache.back().hsaco = hsaco;
657 }
658 
659 // Emits the given module to HSA Code Object. target_machine is an initialized
660 // TargetMachine for the AMDGPU target.
EmitModuleToHsaco(llvm::Module * module,llvm::TargetMachine * target_machine)661 StatusOr<std::vector<uint8>> EmitModuleToHsaco(
662     llvm::Module* module, llvm::TargetMachine* target_machine) {
663   auto* env = tensorflow::Env::Default();
664   std::vector<std::string> tempdir_vector;
665   env->GetLocalTempDirectories(&tempdir_vector);
666   if (tempdir_vector.empty()) {
667     return xla::InternalError(
668         "Unable to locate a temporary directory for compile-time artifacts.");
669   }
670   std::string tempdir_name = tempdir_vector.front();
671   VLOG(1) << "Compile-time artifacts located at: " << tempdir_name;
672 
673   bool keep_tempfiles = false;
674   TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_ROCM_KEEP_XLA_TEMPFILES",
675                                              /*default_val=*/false,
676                                              &keep_tempfiles));
677   // Prepare filenames for all stages of compilation:
678   // IR, binary ISA, and HSACO.
679   std::string random_number = std::to_string(tensorflow::random::New64());
680   std::string ir_filename =
681       absl::StrCat(module->getModuleIdentifier(), random_number + ".ll");
682   std::string ir_path = tensorflow::io::JoinPath(tempdir_name, ir_filename);
683 
684   std::string ir_opt_filename =
685       absl::StrCat(module->getModuleIdentifier(), random_number + "_opt.ll");
686   std::string ir_opt_path =
687       tensorflow::io::JoinPath(tempdir_name, ir_opt_filename);
688 
689   std::string isabin_filename =
690       absl::StrCat(module->getModuleIdentifier(), random_number + ".o");
691   std::string isabin_path =
692       tensorflow::io::JoinPath(tempdir_name, isabin_filename);
693 
694   std::string hsaco_filename =
695       absl::StrCat(module->getModuleIdentifier(), random_number + ".hsaco");
696   std::string hsaco_path =
697       tensorflow::io::JoinPath(tempdir_name, hsaco_filename);
698 
699   std::error_code ec;
700 
701   // Dump LLVM IR.
702   std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
703       new llvm::raw_fd_ostream(ir_path, ec, llvm::sys::fs::OF_None));
704   module->print(*ir_fs, nullptr);
705   ir_fs->flush();
706 
707   // Emit GCN ISA binary.
708   // The extension is stripped by IrDumpingPassManager, so we need to
709   // get creative to add a suffix.
710   std::string module_id = module->getModuleIdentifier();
711   IrDumpingPassManager codegen_passes(
712       ReplaceFilenameExtension(tensorflow::io::Basename(module_id),
713                                random_number + "-amdgpu.dummy"),
714       "", false);
715   codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
716       llvm::Triple(module->getTargetTriple())));
717   llvm::SmallVector<char, 0> stream;
718   llvm::raw_svector_ostream pstream(stream);
719   std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
720       new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
721   module->setDataLayout(target_machine->createDataLayout());
722   target_machine->addPassesToEmitFile(codegen_passes, *isabin_fs, nullptr,
723                                       llvm::CGFT_ObjectFile);
724   codegen_passes.run(*module);
725   isabin_fs->flush();
726 
727   if (keep_tempfiles) {
728     std::unique_ptr<llvm::raw_fd_ostream> ir_fs(
729         new llvm::raw_fd_ostream(ir_opt_path, ec, llvm::sys::fs::OF_None));
730     module->print(*ir_fs, nullptr);
731     ir_fs->flush();
732   }
733   // Locate lld.
734   // TODO(whchung@gmail.com): change to tensorflow::ROCmRoot() after
735   // ROCm-Device-Libs PR.
736   std::string lld_path_1 = tensorflow::io::JoinPath("/opt/rocm", "hcc/bin");
737   std::string lld_path_2 = tensorflow::io::JoinPath("/opt/rocm", "llvm/bin");
738   auto lld_program =
739       llvm::sys::findProgramByName("ld.lld", {lld_path_1, lld_path_2});
740   if (!lld_program) {
741     return xla::InternalError("unable to find ld.lld in PATH: %s",
742                               lld_program.getError().message());
743   }
744   std::vector<llvm::StringRef> lld_args{
745       llvm_ir::AsStringRef("ld.lld"),
746       llvm_ir::AsStringRef("-flavor"),
747       llvm_ir::AsStringRef("gnu"),
748       llvm_ir::AsStringRef("-shared"),
749       llvm_ir::AsStringRef(isabin_path),
750       llvm_ir::AsStringRef("-o"),
751       llvm_ir::AsStringRef(hsaco_path),
752   };
753 
754   std::string error_message;
755   int lld_result =
756       llvm::sys::ExecuteAndWait(*lld_program, llvm_ir::AsArrayRef(lld_args),
757                                 llvm::None, {}, 0, 0, &error_message);
758   if (lld_result) {
759     return xla::InternalError("ld.lld execute fail: %s, error code %d",
760                               error_message, lld_result);
761   }
762 
763   // Read HSACO.
764   std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
765   std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
766 
767   std::vector<uint8> hsaco(hsaco_file_size);
768   hsaco_file.seekg(0, std::ios::beg);
769   hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
770   hsaco_file.close();
771   if (!keep_tempfiles) {
772     remove(ir_path.c_str());
773     remove(isabin_path.c_str());
774     remove(hsaco_path.c_str());
775   }
776   return hsaco;
777 }
778 
779 // Links ROCm-Device-Libs into the given module if the module needs it.
LinkROCDLIfNecessary(llvm::Module * module,std::string amdgpu_version,const string & rocdl_dir_path)780 Status LinkROCDLIfNecessary(llvm::Module* module, std::string amdgpu_version,
781                             const string& rocdl_dir_path) {
782   if (!CouldNeedDeviceBitcode(*module)) {
783     return Status::OK();
784   }
785 
786   return LinkWithBitcodeVector(module,
787                                GetROCDLPaths(amdgpu_version, rocdl_dir_path));
788 }
789 
AMDGPUTargetModuleLinker(llvm::Module * module,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config,const string & device_bitcode_dir_path)790 Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version,
791                                 const HloModuleConfig& hlo_module_config,
792                                 const string& device_bitcode_dir_path) {
793   // Link the input module with ROCDL.
794   auto amdgpu_version = absl::get_if<std::string>(&gpu_version);
795   if (!amdgpu_version) {
796     return xla::InternalError(
797         "Incompatible AMD GCN ISA version was specified.");
798   }
799   TF_RETURN_IF_ERROR(
800       LinkROCDLIfNecessary(module, *amdgpu_version, device_bitcode_dir_path));
801 
802   // If ftz is enabled, set it as an attribute on every function in the module.
803   if (hlo_module_config.debug_options().xla_gpu_ftz()) {
804     for (llvm::Function& fn : *module) {
805       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
806     }
807   }
808 
809   return Status::OK();
810 }
811 
// Translates one feature token taken from the hipDeviceProp_t::gcnArchName
// string into the matching feature string used when creating the AMDGPU
// target. Tokens that are not recognized map to the empty string.
//
// This mapping is currently in a state of flux: TF XLA uses its own copy of
// LLVM, which differs from the LLVM used by hipcc/runtime in the ROCm
// install. Ordinarily that is not a problem, but the hipcc/runtime LLVM has
// "targetID" related changes that have not yet been upstreamed to the LLVM
// repo. Once they are (and the TF LLVM pointer moves past that upstream
// commit), this mapping will need to change.
std::string MapGCNArchNameTokenToFeatureStr(const std::string& token) {
  struct TokenMapping {
    const char* token;
    const char* feature;
  };
  static constexpr TokenMapping kTokenMappings[] = {
      {"sramecc+", "+sramecc"},
      {"sramecc-", "-sramecc"},
      {"xnack+", "+xnack"},
      {"xnack-", "-xnack"},
  };

  for (const TokenMapping& mapping : kTokenMappings) {
    if (token == mapping.token) {
      return mapping.feature;
    }
  }
  return "";
}
834 
// Derives the pair {gfx, feature_str} from a gcnArchName string.
//
// How the pair is built depends on the ROCm version the file is compiled
// against (TF_ROCM_VERSION):
//   * older than 3.9:   gfx is the whole input, feature_str is
//                       "+code-object-v3" (kept only to preserve the
//                       previous behavior);
//   * 3.9 and 3.10:     gfx is the whole input, feature_str is empty;
//   * 4.0 and greater:  the input is split on ':'; the first token becomes
//                       gfx and every later token is mapped through
//                       MapGCNArchNameTokenToFeatureStr, with the results
//                       joined by ','.
std::pair<std::string, std::string> GetFeatureStrFromGCNArchName(
    const std::string& gcn_arch_name) {
#if TF_ROCM_VERSION < 30900
  return {gcn_arch_name, "+code-object-v3"};
#elif TF_ROCM_VERSION < 40000
  return {gcn_arch_name, ""};
#else
  // For ROCm versions 4.0 and greater, the feature string has to match the
  // underlying GPU HW to get max performance.
  const std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
  std::string gfx = tokens.empty() ? gcn_arch_name : tokens.front();

  // Token 0 is the gfxNNN string; the remaining tokens are the
  // feature/targetid strings.
  std::vector<std::string> features;
  for (size_t i = 1; i < tokens.size(); ++i) {
    features.push_back(MapGCNArchNameTokenToFeatureStr(tokens[i]));
  }
  return {std::move(gfx), absl::StrJoin(features, ",")};
#endif
}
867 
AMDGPUGetTargetMachine(llvm::Triple target_triple,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config)868 std::unique_ptr<llvm::TargetMachine> AMDGPUGetTargetMachine(
869     llvm::Triple target_triple, GpuVersion gpu_version,
870     const HloModuleConfig& hlo_module_config) {
871   auto amdgpu_version = absl::get_if<std::string>(&gpu_version);
872   std::string gcn_arch_name = *amdgpu_version;
873   auto arch = GetFeatureStrFromGCNArchName(gcn_arch_name);
874   return GetTargetMachine(std::move(target_triple), arch.first,
875                           hlo_module_config, arch.second);
876 }
877 
AMDGPUBackendInit(const HloModuleConfig & hlo_module_config)878 void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) {
879   llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
880 
881   // Initialize the AMDGPU target; it's the only target we link with, so call
882   // its specific initialization functions instead of the catch-all
883   // InitializeAll*.
884 #if TENSORFLOW_USE_ROCM
885   LLVMInitializeAMDGPUTarget();
886   LLVMInitializeAMDGPUTargetInfo();
887   LLVMInitializeAMDGPUTargetMC();
888   LLVMInitializeAMDGPUAsmPrinter();
889 
890 #if TF_ROCM_VERSION < 40100
891   // Use code-object-v3 for ROCm versions 4.0.1 and lower, since the
892   // HIP runtime for those ROCm versions expects the v3 HSACO objects
893   // Default is now v4 for newer LLVM versions (starting around 210326)
894   FeedLLVMWithFlags({"--amdhsa-code-object-version=3"});
895 #endif
896 
897 #endif
898 
899   llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
900   InitializePasses(registry);
901 }
902 
903 }  // namespace
904 
905 namespace amdgpu {
CompileToHsaco(llvm::Module * module,GpuVersion gpu_version,const HloModuleConfig & hlo_module_config,const string & rocdl_dir_path)906 StatusOr<std::vector<uint8>> CompileToHsaco(
907     llvm::Module* module, GpuVersion gpu_version,
908     const HloModuleConfig& hlo_module_config, const string& rocdl_dir_path) {
909   static absl::once_flag backend_init_flag;
910   absl::call_once(backend_init_flag, AMDGPUBackendInit, hlo_module_config);
911 
912   std::vector<uint8> hsaco;
913   std::unique_ptr<llvm::TargetMachine> target_machine;
914   std::string str;
915   llvm::raw_string_ostream stream(str);
916   stream << *module;
917   // Delete the first two lines, since they usually vary even when the rest of
918   // the code is the same (but verify that they are what we expect).
919   if (str.size() >= 13 && str.substr(0, 13) == "; ModuleID = ") {
920     auto pos = str.find('\n');
921     if (pos != std::string::npos) str = str.substr(pos + 1);
922   }
923   if (str.size() >= 18 && str.substr(0, 18) == "source_filename = ") {
924     auto pos = str.find('\n');
925     if (pos != std::string::npos) str = str.substr(pos + 1);
926   }
927   str += hlo_module_config.compilation_cache_key();
928   {
929     tensorflow::profiler::TraceMe activity(
930         [&] { return absl::StrCat("Compiling IR", module->getName().str()); },
931         tensorflow::profiler::TraceMeLevel::kInfo);
932     XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
933 
934     auto amdgpu_version = absl::get_if<std::string>(&gpu_version);
935     if (!amdgpu_version) {
936       return xla::InternalError(
937           "Incompatible AMD GCN ISA version was specified.");
938     }
939     uint64_t hash;
940     if (HsacoCache::Find(str, hash, *amdgpu_version, hsaco)) {
941       VLOG(1) << "HSACO cache hit";
942       return hsaco;
943     }
944     VLOG(1) << "HSACO cache miss";
945     bool dump_lls = false;
946     if (dump_lls) {
947       static int hsaco_count = 0;
948       std::string name = "/tmp/" + std::to_string(hsaco_count) + ".ll";
949       hsaco_count++;
950       std::ofstream ofs(name);
951       ofs << str;
952       ofs.close();
953     }
954 
955     llvm::Triple default_target_triple("amdgcn--amdhsa-amdgiz");
956     // Construct LLVM TargetMachine for AMDGPU.
957     std::unique_ptr<llvm::TargetMachine> target_machine =
958         AMDGPUGetTargetMachine(default_target_triple, gpu_version,
959                                hlo_module_config);
960 
961     // Link with ROCm-Device-Libs, and optimize the LLVM module.
962     TF_RETURN_IF_ERROR(LinkAndOptimizeModule(
963         module, gpu_version, hlo_module_config, rocdl_dir_path,
964         AMDGPUTargetModuleLinker, default_target_triple, target_machine.get(),
965         kAMDGPUInlineThreshold));
966 
967     // Lower optimized LLVM module to HSA code object.
968     TF_ASSIGN_OR_RETURN(hsaco, EmitModuleToHsaco(module, target_machine.get()));
969     HsacoCache::Add(str, hash, *amdgpu_version, hsaco);
970   }
971   return hsaco;
972 }
973 
974 }  // namespace amdgpu
975 
976 }  // namespace gpu
977 }  // namespace xla
978