/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h"

#include <map>
#include <memory>
#include <string>
#include <utility>

#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/dump_ir_pass.h"
#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/utils.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"

#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/CodeGen/CommandFlags.inc"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Linker/Linker.h"
#include "llvm/PassRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/tracing.h"
namespace xla {
namespace gpu {
namespace {

// Default inline threshold value to use in llvm.
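// For reference, LLVM's own default inline threshold has long been 225, so
// this value makes the inliner considerably more aggressive.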
const int kDefaultInlineThreshold = 1100;

// Gets the libdevice filename for a particular compute capability. When
// presented with a GPU we don't recognize, we just return the libdevice from
// compute_20.
static string GetLibdeviceFilename(const string& libdevice_dir_path,
                                   std::pair<int, int> compute_capability) {
  // Since CUDA 9.0, all GPU versions are covered by a single libdevice file.
  const char* unified_libdevice_filename = "libdevice.10.bc";
  std::vector<string> unified_libdevice_files;
  const Status status = tensorflow::Env::Default()->GetMatchingPaths(
      tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
      &unified_libdevice_files);
  if (status.ok() && unified_libdevice_files.size() == 1) {
    return unified_libdevice_filename;
  }
  // There are only four libdevice files: compute_{20,30,35,50}. Each GPU
  // version gets mapped to one of these. Note in particular that sm_60 and
  // sm_61 map to libdevice.compute_30.
  static auto* m = new std::map<std::pair<int, int>, int>({{{2, 0}, 20},
                                                           {{2, 1}, 20},
                                                           {{3, 0}, 30},
                                                           {{3, 2}, 30},
                                                           {{3, 5}, 35},
                                                           {{3, 7}, 35},
                                                           {{5, 0}, 50},
                                                           {{5, 2}, 50},
                                                           {{5, 3}, 50},
                                                           {{6, 0}, 30},
                                                           {{6, 1}, 30},
                                                           {{6, 2}, 30}});
  int libdevice_version = 20;
  auto it = m->find(compute_capability);
  if (it != m->end()) {
    libdevice_version = it->second;
  } else {
105 LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
106 << ", " << compute_capability.second << ") ."
107 << "Defaulting to libdevice for compute_" << libdevice_version;
108 }
109 return absl::StrCat("libdevice.compute_", libdevice_version, ".10.bc");
110 }

// Gets the GPU name as it's known to LLVM for a given compute capability. If
// we see an unrecognized compute capability, we return "sm_35".
static string GetSmName(std::pair<int, int> compute_capability) {
  static auto* m = new std::map<std::pair<int, int>, int>({
      {{3, 5}, 35},
      {{3, 7}, 37},
      {{5, 0}, 50},
      {{5, 2}, 52},
      {{5, 3}, 53},
      {{6, 0}, 60},
      {{6, 1}, 61},
      {{6, 2}, 62},
      {{7, 0}, 70},
      {{7, 2}, 72},
      {{7, 5}, 75},
  });
  int sm_version = 35;
  auto it = m->find(compute_capability);
  if (it != m->end()) {
    sm_version = it->second;
  } else {
133 LOG(WARNING) << "Unknown compute capability (" << compute_capability.first
134 << ", " << compute_capability.second << ") ."
135 << "Defaulting to telling LLVM that we're compiling for sm_"
136 << sm_version;
137 }
138 return absl::StrCat("sm_", sm_version);
139 }

// Convenience function for producing a name of a temporary compilation product
// from the input filename.
string MakeNameForTempProduct(absl::string_view input_filename,
                              absl::string_view extension) {
  return ReplaceFilenameExtension(tensorflow::io::Basename(input_filename),
                                  extension);
}

// Initializes LLVM passes. Uses the PassRegistry mechanism.
void InitializePasses(llvm::PassRegistry* pass_registry) {
  llvm::initializeCore(*pass_registry);
  llvm::initializeCodeGen(*pass_registry);
  llvm::initializeScalarOpts(*pass_registry);
  llvm::initializeObjCARCOpts(*pass_registry);
  llvm::initializeVectorization(*pass_registry);
  llvm::initializeIPO(*pass_registry);
  llvm::initializeAnalysis(*pass_registry);
  llvm::initializeTransformUtils(*pass_registry);
  llvm::initializeInstCombine(*pass_registry);
  llvm::initializeInstrumentation(*pass_registry);
  llvm::initializeTarget(*pass_registry);
  llvm::initializeCodeGenPreparePass(*pass_registry);
}

// Returns the TargetMachine, given a triple.
std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
    llvm::Triple triple, absl::string_view cpu_name,
    const HloModuleConfig& hlo_module_config) {
  std::string error;
  const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error);
  if (target == nullptr) {
    LOG(FATAL) << "Unable to find Target for triple '" << triple.str() << "'"
               << " -- " << error;
    return nullptr;
  }

  TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();

  // Turn off verbose assembly output.
  target_options.MCOptions.AsmVerbose = false;

  // The selection of codegen optimization level is copied from function
  // GetCodeGenOptLevel in //third_party/llvm/llvm/tools/opt/opt.cpp.
  CodeGenOpt::Level codegen_opt_level;
  switch (hlo_module_config.debug_options().xla_backend_optimization_level()) {
    case 1:
      codegen_opt_level = CodeGenOpt::Less;
      break;
    case 2:
      codegen_opt_level = CodeGenOpt::Default;
      break;
    case 3:
      codegen_opt_level = CodeGenOpt::Aggressive;
      break;
    default:
      codegen_opt_level = CodeGenOpt::None;
  }
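  // The "+ptx60" feature string asks the NVPTX backend to emit PTX ISA 6.0.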
  return absl::WrapUnique(target->createTargetMachine(
      triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
      getRelocModel(), getCodeModel(), codegen_opt_level));
}

// Adds the standard LLVM optimization passes, based on the speed optimization
// level (opt_level) and size optimization level (size_level). Both module
// and function-level passes are added, so two pass managers are passed in and
// modified by this function.
void AddOptimizationPasses(unsigned opt_level, unsigned size_level,
                           llvm::TargetMachine* target_machine,
                           llvm::legacy::PassManagerBase* module_passes,
                           llvm::legacy::FunctionPassManager* function_passes) {
  PassManagerBuilder builder;
  builder.OptLevel = opt_level;
  builder.SizeLevel = size_level;

  if (opt_level > 1) {
    builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold);
  } else {
    // Only inline functions marked with "alwaysinline".
    builder.Inliner = llvm::createAlwaysInlinerLegacyPass();
  }

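  // Mirror the usual -O level behavior: no loop unrolling at -O0, loop
  // vectorization from -O1, and SLP vectorization from -O2 unless we are
  // optimizing hard for size.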
  builder.DisableUnitAtATime = false;
  builder.DisableUnrollLoops = opt_level == 0;
  builder.LoopVectorize = opt_level > 0;
  builder.SLPVectorize = opt_level > 1 && size_level < 2;

  // NVPTX's early-as-possible passes include NVVM reflect.
  target_machine->adjustPassManager(builder);

  builder.populateFunctionPassManager(*function_passes);
  builder.populateModulePassManager(*module_passes);
}

// Emits the given module to a bitcode file.
void EmitBitcodeToFile(const Module& module, absl::string_view filename) {
  std::error_code error_code;
  llvm::ToolOutputFile outfile(string(filename).c_str(), error_code,
                               llvm::sys::fs::F_None);
  if (error_code) {
    LOG(FATAL) << "opening bitcode file for writing: " << error_code.message();
  }

  llvm::WriteBitcodeToFile(module, outfile.os());
  outfile.keep();
}

// Emits the given module to PTX. target_machine is an initialized
// TargetMachine for the NVPTX target.
string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) {
  std::string ptx;  // need a std::string instead of a ::string.
  {
    llvm::raw_string_ostream stream(ptx);
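    // addPassesToEmitFile needs a raw_pwrite_stream; buffer_ostream adapts the
    // string stream and flushes into it when it goes out of scope.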
    llvm::buffer_ostream pstream(stream);
    // The extension is stripped by IrDumpingPassManager, so we need to
    // get creative to add a suffix.
    IrDumpingPassManager codegen_passes(
        MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"),
        "", false);
    codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass(
        llvm::Triple(module->getTargetTriple())));

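    // For the NVPTX target, the "assembly" file type is PTX text, which is
    // exactly what this function is meant to produce.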
    target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr,
                                        llvm::TargetMachine::CGFT_AssemblyFile);
    codegen_passes.run(*module);
  }

  return ptx;
}

// LLVM has an extensive flags mechanism of its own, which is only accessible
// through the command line. Internal libraries within LLVM register parsers
// for flags, and there is no other way to configure them than to pass these
// flags. To do this programmatically, we invoke ParseCommandLineOptions
// manually with a "fake argv".
// Note: setting flags with this method is stateful, since flags are just
// static globals within LLVM libraries.
void FeedLLVMWithFlags(const std::vector<string>& cl_opts) {
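  // The first element plays the role of argv[0] (the program name), which the
  // parser skips, so an empty string is fine as a placeholder.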
  std::vector<const char*> fake_argv = {""};
  for (const string& cl_opt : cl_opts) {
    fake_argv.push_back(cl_opt.c_str());
  }
  llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]);
}

// Returns whether the module could use any libdevice functions. This function
// may have false positives -- the module might not use libdevice even if this
// function returns true.
bool CouldNeedLibdevice(const llvm::Module& module) {
  for (const llvm::Function& function : module.functions()) {
    // This is a conservative approximation -- not all such functions are in
    // libdevice.
    if (!function.isIntrinsic() && function.isDeclaration()) {
      return true;
    }
  }
  return false;
}

// Links libdevice into the given module if the module needs libdevice.
Status LinkLibdeviceIfNecessary(llvm::Module* module,
                                std::pair<int, int> compute_capability,
                                const string& libdevice_dir_path) {
  if (!CouldNeedLibdevice(*module)) {
    return Status::OK();
  }

  llvm::Linker linker(*module);
  string libdevice_path = tensorflow::io::JoinPath(
      libdevice_dir_path,
      GetLibdeviceFilename(libdevice_dir_path, compute_capability));
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->FileExists(libdevice_path));
  VLOG(1) << "Linking with libdevice from: " << libdevice_path;
  std::unique_ptr<llvm::Module> libdevice_module =
      LoadIRModule(libdevice_path, &module->getContext());
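  // Link only the libdevice symbols that are actually referenced, and
  // internalize whatever gets pulled in so unused definitions can later be
  // optimized away.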
  if (linker.linkInModule(
          std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded,
          [](Module& M, const StringSet<>& GVS) {
            internalizeModule(M, [&GVS](const GlobalValue& GV) {
              return !GV.hasName() || (GVS.count(GV.getName()) == 0);
            });
          })) {
    return tensorflow::errors::Internal(
        absl::StrCat("Error linking libdevice from ", libdevice_path));
  }
  return Status::OK();
}

StatusOr<string> CompileModuleToPtx(llvm::Module* module,
                                    std::pair<int, int> compute_capability,
                                    const HloModuleConfig& hlo_module_config,
                                    const string& libdevice_dir_path) {
  // If the module has no functions or globals, there's nothing to compile.
  // Just return an empty string.
  if (module->empty() && module->global_empty()) {
    VLOG(2) << "Module '" << module->getName().str()
            << "' is empty. Skipping compilation.";
    return string();
  }
  // Link the input module with libdevice, to pull in implementations of some
  // builtins.
  TF_RETURN_IF_ERROR(
      LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path));

  // Set the flush-denormals-to-zero flag on the module so the NVVM reflect
  // pass can access it.
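  // Module::Override means this value wins if another linked module carries a
  // conflicting "nvvm-reflect-ftz" flag.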
  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
                        hlo_module_config.debug_options().xla_gpu_ftz());

  // If ftz is enabled, set it as an attribute on every function in the module.
  if (hlo_module_config.debug_options().xla_gpu_ftz()) {
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("nvptx-f32ftz", "true");
    }
  }

  IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false);

  // Add an appropriate TargetLibraryInfo pass for the module's triple.
  llvm::TargetLibraryInfoWrapperPass* tliwp =
      new llvm::TargetLibraryInfoWrapperPass(
          llvm::Triple(module->getTargetTriple()));
  module_passes.add(tliwp);

  // Try to fetch the target triple from the module. If not present, set a
  // default target triple.
  llvm::Triple target_triple = llvm::Triple(module->getTargetTriple());
  if (target_triple.getArch() == llvm::Triple::UnknownArch) {
    LOG(WARNING) << "target triple not found in the module";
    target_triple = llvm::Triple("nvptx64-unknown-unknown");
  }

  // Figure out the exact name of the processor as known to the NVPTX backend
  // from the gpu_architecture flag.
  std::unique_ptr<llvm::TargetMachine> target_machine = GetTargetMachine(
      target_triple, GetSmName(compute_capability), hlo_module_config);
  module_passes.add(llvm::createTargetTransformInfoWrapperPass(
      target_machine->getTargetIRAnalysis()));

  // The LLVM IR verifier performs sanity checking on the IR. This helps
  // discover problems and report them in a meaningful manner, rather than let
  // later passes report obscure assertions because of unfulfilled invariants.
  module_passes.add(llvm::createVerifierPass());

  // Create the function-level pass manager. It needs data layout information
  // too.
  llvm::legacy::FunctionPassManager function_passes(module);

  int32 opt_level =
      hlo_module_config.debug_options().xla_backend_optimization_level();

  if (opt_level < 2) {
    LOG(ERROR) << std::string(80, '*');
    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
                  "generation but ";
    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
               << "!";
    LOG(ERROR) << "(Supported configuration is "
                  "--xla_backend_optimization_level >= 2.)";
    LOG(ERROR) << std::string(80, '*');
  }

  AddOptimizationPasses(opt_level,
                        /*size_level=*/0, target_machine.get(), &module_passes,
                        &function_passes);

  // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA
  // again after the standard optimization passes [http://b/13329423].
  // TODO(jingyue): SROA may further expose more optimization opportunities
  // such as more precise alias analysis and more function inlining (SROA may
  // change the inlining cost of a function). For now, running SROA already
  // emits good enough code for the evaluated benchmarks. We may want to run
  // more optimizations later.
  if (opt_level > 0) {
    // LLVM's optimizer turns on SROA when the optimization level is greater
    // than 0. We mimic this behavior here.
    module_passes.add(llvm::createSROAPass());
  }

  // Verify that the module is well formed after optimizations ran.
  module_passes.add(llvm::createVerifierPass());

  // Done populating the pass managers. Now run them.

  function_passes.doInitialization();
  for (auto func = module->begin(); func != module->end(); ++func) {
    function_passes.run(*func);
  }
  function_passes.doFinalization();
  module_passes.run(*module);

  // Finally, produce PTX.
  return EmitModuleToPTX(module, target_machine.get());
}

// One-time module initializer.
// Must be called only once -- DO NOT CALL DIRECTLY.
void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
  // Feed all customized flags here, so they can be overridden with
  // llvm_cl_opts without redeploying the compiler during development.

  // This flag tunes a threshold in branch folding. The default threshold,
  // which is one, is not suitable for CUDA programs where branches are more
  // expensive than for CPU programs. Setting the threshold to 2 improves the
  // latency of TwoDPatchDotProductKernel_IND_3_ND_48 by over 5%, and does not
  // affect the latency of other benchmarks so far.
  //
  // I also tried setting this threshold to other values:
  // * 3-6 give similar results to 2;
  // * >6 starts hurting the performance of at least dot product kernels.
  //
  // TODO(jingyue): The current threshold only considers the number of IR
  // instructions, which does not accurately reflect the true cost. We need a
  // better cost model.
  FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
  // Increase the limit used when scanning memory dependencies. This helps
  // eliminate more redundant load instructions.
  //
  // The specific value is currently large enough for s3d in the SHOC benchmark,
  // which contains a lot of load instructions and many arithmetic instructions
  // between those loads.
  FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});

  // Use div.approx -- it matters for some float-division heavy benchmarks.
  FeedLLVMWithFlags({"-nvptx-prec-divf32=0"});

  llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);

  // Initialize the NVPTX target; it's the only target we link with, so call
  // its specific initialization functions instead of the catch-all
  // InitializeAll*.
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  // Initialize the LLVM optimization passes.
  llvm::PassRegistry* registry = llvm::PassRegistry::getPassRegistry();
  InitializePasses(registry);
}

}  // namespace

StatusOr<string> CompileToPtx(llvm::Module* module,
                              std::pair<int, int> compute_capability,
                              const HloModuleConfig& hlo_module_config,
                              const string& libdevice_dir_path) {
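  // The backend must be initialized exactly once before the first compilation;
  // std::call_once makes that initialization thread-safe.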
  static std::once_flag backend_init_flag;
  std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config);

  string ptx;
  {
    tensorflow::tracing::ScopedActivity activity("Compiling IR",
                                                 module->getName().str(),
                                                 /*is_expensive=*/true);
    XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str());
    TF_ASSIGN_OR_RETURN(
        ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
                                libdevice_dir_path));
  }
  return ptx;
}

}  // namespace gpu
}  // namespace xla