• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h"
17 
18 #include <map>
19 #include <memory>
20 #include <string>
21 #include <utility>
22 
23 #include "absl/memory/memory.h"
24 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
25 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
26 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
27 #include "tensorflow/compiler/xla/status_macros.h"
28 #include "tensorflow/compiler/xla/util.h"
29 
30 #include "absl/strings/str_cat.h"
31 #include "absl/strings/string_view.h"
32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/ADT/StringMap.h"
34 #include "llvm/ADT/StringSet.h"
35 #include "llvm/Analysis/TargetLibraryInfo.h"
36 #include "llvm/Analysis/TargetTransformInfo.h"
37 #include "llvm/Bitcode/BitcodeReader.h"
38 #include "llvm/Bitcode/BitcodeWriter.h"
39 #include "llvm/CodeGen/CommandFlags.inc"
40 #include "llvm/IR/LLVMContext.h"
41 #include "llvm/IR/LegacyPassManager.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Verifier.h"
44 #include "llvm/Linker/Linker.h"
45 #include "llvm/PassRegistry.h"
46 #include "llvm/Support/CommandLine.h"
47 #include "llvm/Support/FileSystem.h"
48 #include "llvm/Support/FormattedStream.h"
49 #include "llvm/Support/TargetRegistry.h"
50 #include "llvm/Support/TargetSelect.h"
51 #include "llvm/Support/ToolOutputFile.h"
52 #include "llvm/Target/TargetMachine.h"
53 #include "llvm/Transforms/IPO.h"
54 #include "llvm/Transforms/IPO/AlwaysInliner.h"
55 #include "llvm/Transforms/IPO/Internalize.h"
56 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
57 #include "llvm/Transforms/Scalar.h"
58 #include "tensorflow/compiler/xla/types.h"
59 #include "tensorflow/core/lib/io/path.h"
60 #include "tensorflow/core/platform/env.h"
61 #include "tensorflow/core/platform/logging.h"
62 #include "tensorflow/core/platform/tracing.h"
63 
namespace xla {
namespace gpu {
namespace {

// Default inline threshold value to use in llvm.
// Consumed by AddOptimizationPasses below when opt_level > 1, where it is
// passed to llvm::createFunctionInliningPass.
const int kDefaultInlineThreshold = 1100;
71 // Gets the libdevice filename for a particular compute capability.  When
72 // presented with a GPU we don't recognize, we just return the libdevice from
73 // compute_20.
GetLibdeviceFilename(const string & libdevice_dir_path,std::pair<int,int> compute_capability)74 static string GetLibdeviceFilename(const string& libdevice_dir_path,
75                                    std::pair<int, int> compute_capability) {
76   // Since CUDA 9.0, all GPU versions are included in a single file
77   const char* unified_libdevice_filename = "libdevice.10.bc";
78   std::vector<string> unified_libdevice_files;
79   const Status status = tensorflow::Env::Default()->GetMatchingPaths(
80       tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
81       &unified_libdevice_files);
82   if (status.ok() && unified_libdevice_files.size() == 1) {
83     return unified_libdevice_filename;
84   }
85   // There are only four libdevice files: compute_{20,30,35,50}.  Each GPU
86   // version gets mapped to one of these.  Note in particular that sm_60 and
87   // sm_61 map to libdevice.compute_30.
88   static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
89                                                            {{2, 1}, 20},
90                                                            {{3, 0}, 30},
91                                                            {{3, 2}, 30},
92                                                            {{3, 5}, 35},
93                                                            {{3, 7}, 35},
94                                                            {{5, 0}, 50},
95                                                            {{5, 2}, 50},
96                                                            {{5, 3}, 50},
97                                                            {{6, 0}, 30},
98                                                            {{6, 1}, 30},
99                                                            {{6, 2}, 30}});
100   int libdevice_version = 20;
101   auto it = m->find(compute_capability);
102   if (it != m->end()) {
103     libdevice_version = it->second;
104   } else {
105     LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
106                  << ", " << compute_capability.second << ") ."
107                  << "Defaulting to libdevice for compute_" << libdevice_version;
108   }
109   return absl::StrCat("libdevice.compute_", libdevice_version, ".10.bc");
110 }
111 
112 // Gets the GPU name as it's known to LLVM for a given compute capability.  If
113 // we see an unrecognized compute capability, we return "sm_35".
GetSmName(std::pair<int,int> compute_capability)114 static string GetSmName(std::pair<int, int> compute_capability) {
115   static auto* m = new std::map<std::pair<int, int>, int>({
116       {{3, 5}, 35},
117       {{3, 7}, 37},
118       {{5, 0}, 50},
119       {{5, 2}, 52},
120       {{5, 3}, 53},
121       {{6, 0}, 60},
122       {{6, 1}, 61},
123       {{6, 2}, 62},
124       {{7, 0}, 70},
125       {{7, 2}, 72},
126       {{7, 5}, 75},
127   });
128   int sm_version = 35;
129   auto it = m->find(compute_capability);
130   if (it != m->end()) {
131     sm_version = it->second;
132   } else {
133     LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
134                  << ", " << compute_capability.second << ") ."
135                  << "Defaulting to telling LLVM that we're compiling for sm_"
136                  << sm_version;
137   }
138   return absl::StrCat("sm_", sm_version);
139 }
140 
141 // Convenience function for producing a name of a temporary compilation product
142 // from the input filename.
MakeNameForTempProduct(absl::string_view input_filename,absl::string_view extension)143 string MakeNameForTempProduct(absl::string_view input_filename,
144                               absl::string_view extension) {
145   return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
146                                   extension);
147 }
148 
149 // Initializes LLVM passes. Uses the PassRegistry mechanism.
InitializePasses(llvm::PassRegistry * pass_registry)150 void InitializePasses(llvm::PassRegistry* pass_registry) {
151   llvm::initializeCore(*pass_registry);
152   llvm::initializeCodeGen(*pass_registry);
153   llvm::initializeScalarOpts(*pass_registry);
154   llvm::initializeObjCARCOpts(*pass_registry);
155   llvm::initializeVectorization(*pass_registry);
156   llvm::initializeIPO(*pass_registry);
157   llvm::initializeAnalysis(*pass_registry);
158   llvm::initializeTransformUtils(*pass_registry);
159   llvm::initializeInstCombine(*pass_registry);
160   llvm::initializeInstrumentation(*pass_registry);
161   llvm::initializeTarget(*pass_registry);
162   llvm::initializeCodeGenPreparePass(*pass_registry);
163 }
164 
165 // Returns the TargetMachine, given a triple.
GetTargetMachine(llvm::Triple triple,absl::string_view cpu_name,const HloModuleConfig & hlo_module_config)166 std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
167     llvm::Triple triple, absl::string_view cpu_name,
168     const HloModuleConfig& hlo_module_config) {
169   std::string error;
170   const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
171   if (target == nullptr) {
172     LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
173                << " -- " << error;
174     return nullptr;
175   }
176 
177   TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
178 
179   // Set the verbose assembly options.
180   target_options.MCOptions.AsmVerbose = false;
181 
182   // The selection of codegen optimization level is copied from function
183   // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
184   CodeGenOpt::Level codegen_opt_level;
185   switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
186     case 1:
187       codegen_opt_level = CodeGenOpt::Less;
188       break;
189     case 2:
190       codegen_opt_level = CodeGenOpt::Default;
191       break;
192     case 3:
193       codegen_opt_level = CodeGenOpt::Aggressive;
194       break;
195     default:
196       codegen_opt_level = CodeGenOpt::None;
197   }
198   return absl::WrapUnique(target->createTargetMachine(
199       triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
200       getRelocModel(), getCodeModel(), codegen_opt_level));
201 }
202 
203 // Adds the standard LLVM optimization passes, based on the speed optimization
204 // level (opt_level) and size optimization level (size_level). Both module
205 // and function-level passes are added, so two pass managers are passed in and
206 // modified by this function.
AddOptimizationPasses(unsigned opt_level,unsigned size_level,llvm::TargetMachine * target_machine,llvm::legacy::PassManagerBase * module_passes,llvm::legacy::FunctionPassManager * function_passes)207 void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
208                            llvm::TargetMachine* target_machine,
209                            llvm::legacy::PassManagerBase* module_passes,
210                            llvm::legacy::FunctionPassManager* function_passes) {
211   PassManagerBuilder builder;
212   builder.OptLevel = opt_level;
213   builder.SizeLevel = size_level;
214 
215   if (opt_level > 1) {
216     builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold);
217   } else {
218     // Only inline functions marked with "alwaysinline".
219     builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
220   }
221 
222   builder.DisableUnitAtATime = false;
223   builder.DisableUnrollLoops = opt_level == 0;
224   builder.LoopVectorize = opt_level > 0;
225   builder.SLPVectorize = opt_level > 1 && size_level < 2;
226 
227   // NVPTX's early-as-possible passes include NVVM reflect.
228   target_machine->adjustPassManager(builder);
229 
230   builder.populateFunctionPassManager(*function_passes);
231   builder.populateModulePassManager(*module_passes);
232 }
233 
234 // Emits the given module to a bit code file.
EmitBitcodeToFile(const Module & module,absl::string_view filename)235 void EmitBitcodeToFile(const Module& module, absl::string_view filename) {
236   std::error_code error_code;
237   llvm::ToolOutputFile outfile(string(filename).c_str(), error_code,
238                                llvm::sys::fs::F_None);
239   if (error_code) {
240     LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
241   }
242 
243   llvm::WriteBitcodeToFile(module, outfile.os());
244   outfile.keep();
245 }
246 
247 // Emits the given module to PTX. target_machine is an initialized TargetMachine
248 // for the NVPTX target.
EmitModuleToPTX(Module * module,llvm::TargetMachine * target_machine)249 string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
250   std::string ptx;  // need a std::string instead of a ::string.
251   {
252     llvm::raw_string_ostream stream(ptx);
253     llvm::buffer_ostream pstream(stream);
254     // The extension is stripped by IrDumpingPassManager, so we need to
255     // get creative to add a suffix.
256     IrDumpingPassManager codegen_passes(
257         MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
258         "", false);
259     codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
260         llvm::Triple(module->getTargetTriple())));
261 
262     target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
263                                         llvm::TargetMachine::CGFT_AssemblyFile);
264     codegen_passes.run(*module);
265   }
266 
267   return ptx;
268 }
269 
270 // LLVM has an extensive flags mechanism of its own, which is only accessible
271 // through the command line. Internal libraries within LLVM register parsers for
272 // flags, with no other way to configure them except pass these flags.
273 // To do this programmatically, we invoke ParseCommandLineOptions manually with
274 // a "fake argv".
275 // Note: setting flags with this method is stateful, since flags are just
276 // static globals within LLVM libraries.
FeedLLVMWithFlags(const std::vector<string> & cl_opts)277 void FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
278   std::vector<const char*> fake_argv = {""};
279   for (const string& cl_opt : cl_opts) {
280     fake_argv.push_back(cl_opt.c_str());
281   }
282   llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
283 }
284 
285 // Returns whether the module could use any libdevice functions. This function
286 // may have false positives -- the module might not use libdevice even if this
287 // function returns true.
CouldNeedLibdevice(const llvm::Module & module)288 bool CouldNeedLibdevice(const llvm::Module& module) {
289   for (const llvm::Function& function : module.functions()) {
290     // This is a conservative approximation -- not all such functions are in
291     // libdevice.
292     if (!function.isIntrinsic() && function.isDeclaration()) {
293       return true;
294     }
295   }
296   return false;
297 }
298 
// Links libdevice into the given module if the module needs libdevice.
// Returns OK if linking succeeded or was unnecessary; returns an error if the
// libdevice file is missing or the link fails.
Status LinkLibdeviceIfNecessary(llvm::Module* module,
                                std::pair<int, int> compute_capability,
                                const string& libdevice_dir_path) {
  // Fast path: skip linking entirely when the module has no non-intrinsic
  // external declarations (only those could resolve to libdevice).
  if (!CouldNeedLibdevice(*module)) {
    return Status::OK();
  }

  llvm::Linker linker(*module);
  string libdevice_path = tensorflow::io::JoinPath(
      libdevice_dir_path,
      GetLibdeviceFilename(libdevice_dir_path, compute_capability));
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
  std::unique_ptr<llvm::Module> libdevice_module =
      LoadIRModule(libdevice_path, &module->getContext());
  // LinkOnlyNeeded pulls in just the libdevice definitions this module
  // references.  The callback then internalizes every linked-in global whose
  // name is not in GVS (the "needed" set), so unused libdevice code can be
  // discarded by later optimization passes.  linkInModule returns true on
  // error.
  if (linker.linkInModule(
          std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded,
          [](Module& M, const StringSet<>& GVS) {
            internalizeModule(M, [&GVS](const GlobalValue& GV) {
              return !GV.hasName() || (GVS.count(GV.getName()) == 0);
            });
          })) {
    return tensorflow::errors::Internal(
        absl::StrCat("Error linking libdevice from ", libdevice_path));
  }
  return Status::OK();
}
327 
// Compiles `module` down to PTX text for the given compute capability: links
// libdevice if needed, runs the standard LLVM optimization pipeline (plus an
// extra SROA pass), then invokes the NVPTX backend.  Returns the PTX string,
// or an empty string for an empty module.
StatusOr<string> CompileModuleToPtx(llvm::Module* module,
                                    std::pair<int, int> compute_capability,
                                    const HloModuleConfig& hlo_module_config,
                                    const string& libdevice_dir_path) {
  // If the module has no functions or globals, there's nothing to compile. Just
  // return an empty string.
  if (module->empty() && module->global_empty()) {
    VLOG(2) << "Module '" << module->getName().str()
            << "' is empty. Skipping compilation.";
    return string();
  }
  // Link the input module with libdevice, to pull in implementations of some
  // builtins.
  TF_RETURN_IF_ERROR(
      LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));

  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass
  // can access it.
  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                        hlo_module_config.debug_options().xla_gpu_ftz());

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("nvptx-f32ftz", "true");
    }
  }

  IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false);

  // Add an appropriate TargetLibraryInfo pass for the module's triple.
  llvm::TargetLibraryInfoWrapperPass* tliwp =
      new llvm::TargetLibraryInfoWrapperPass(
          llvm::Triple(module->getTargetTriple()));
  module_passes.add(tliwp);

  // Try to fetch the target triple from the module. If not present, set a
  // default target triple.
  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
    LOG(WARNING) << "target triple not found in the module";
    target_triple = llvm::Triple("nvptx64-unknown-unknown");
  }

  // Figure out the exact name of the processor as known to the NVPTX backend
  // from the gpu_architecture flag.
  std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine(
      target_triple, GetSmName(compute_capability), hlo_module_config);
  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
      target_machine->getTargetIRAnalysis()));

  // The LLVM IR verifier performs sanity checking on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than let
  // later passes report obscure assertions because of unfulfilled invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information
  // too.
  llvm::legacy::FunctionPassManager function_passes(module);

  int32 opt_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  // Warn loudly (but proceed) when the requested level is below what the GPU
  // backend supports.
  if (opt_level < 2) {
    LOG(ERROR) << std::string(80, '*');
    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
                  "generation but ";
    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
               << "!";
    LOG(ERROR) << "(Supported configuration is "
                  "--xla_backend_optimization_level >= 2.)";
    LOG(ERROR) << std::string(80, '*');
  }

  AddOptimizationPasses(opt_level,
                        /*size_level=*/0, target_machine.get(), &module_passes,
                        &function_passes);

  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
  // again after the standard optimization passes [http://b/13329423].
  // TODO(jingyue): SROA may further expose more optimization opportunities such
  // as more precise alias analysis and more function inlining (SROA may change
  // the inlining cost of a function). For now, running SROA already emits good
  // enough code for the evaluated benchmarks. We may want to run more
  // optimizations later.
  if (opt_level > 0) {
    // LLVM's optimizer turns on SROA when the optimization level is greater
    // than 0. We mimic this behavior here.
    module_passes.add(llvm::createSROAPass());
  }

  // Verify that the module is well formed after optimizations ran.
  module_passes.add(llvm::createVerifierPass());

  // Done populating the pass managers. Now run them.

  function_passes.doInitialization();
  for (auto func = module->begin(); func != module->end(); ++func) {
    function_passes.run(*func);
  }
  function_passes.doFinalization();
  module_passes.run(*module);

  // Finally, produce PTX.
  return EmitModuleToPTX(module, target_machine.get());
}
434 
// One-time module initializer.
// Must be called only once -- DO NOT CALL DIRECTLY.
// Configures global LLVM command-line flags, registers the NVPTX target, and
// initializes the LLVM pass registry.  All of this mutates process-global
// state, which is why callers funnel through std::call_once (see CompileToPtx).
void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
  // Feed all customized flags here, so we can override them with llvm_cl_opts
  // without redeploy the compiler for development purpose.

  // This flag tunes a threshold in branch folding. The default threshold, which
  // is one, is not suitable for CUDA programs where branches are more expensive
  // than for CPU programs. Setting the threshold to 2 improves the latency of
  // TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not affect the
  // latency of other benchmarks so far.
  //
  // I also tried setting this threshold to other values:
  // * 3-6 gives similar results as 2;
  // * >6 start hurting the performance of at least dot product kernels.
  //
  // TODO(jingyue): The current threshold only considers the number of IR
  // instructions which do not accurately reflect the true cost. We need a
  // better cost model.
  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
  // Increase limit when scanning memory dependencies.  This helps to reduce
  // more redundant load instructions.
  //
  // The specific value is currently large enough for s3d in shoc benchmark,
  // which contains a lot of load instructions and many arithmetic instructions
  // between those loads.
  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});

  // Use div.approx -- it matters for some float-division heavy benchmarks.
  FeedLLVMWithFlags({"-nvptx-prec-divf32=0"});

  // Apply any user-supplied LLVM flags last so they can override the defaults
  // set above.
  llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);

  // Initialize the NVPTX target; it's the only target we link with, so call its
  // specific initialization functions instead of the catch-all InitializeAll*.
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  // Initialize the LLVM optimization passes.
  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}
479 
480 }  // namespace
481 
CompileToPtx(llvm::Module * module,std::pair<int,int> compute_capability,const HloModuleConfig & hlo_module_config,const string & libdevice_dir_path)482 StatusOr<string> CompileToPtx(llvm::Module* module,
483                               std::pair<int, int> compute_capability,
484                               const HloModuleConfig& hlo_module_config,
485                               const string& libdevice_dir_path) {
486   static std::once_flag backend_init_flag;
487   std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config);
488 
489   string ptx;
490   {
491     tensorflow::tracing::ScopedActivity activity("Compiling IR",
492                                                  module->getName().str(),
493                                                  /*is_expensive=*/true);
494     XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
495     TF_ASSIGN_OR_RETURN(
496         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
497                                 libdevice_dir_path));
498   }
499   return ptx;
500 }
501 
502 }  // namespace gpu
503 }  // namespace xla
504