/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/container/node_hash_map.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/platform/mutex.h"

namespace xla {
namespace gpu {

void WarnIfBadDriverJITVersion();

// NVPTXCompiler generates efficient GPU executables for the NVPTX target.
class NVPTXCompiler : public GpuCompiler {
 public:
  NVPTXCompiler();
  ~NVPTXCompiler() override {}

  Status OptimizeHloConvolutionCanonicalization(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  Status OptimizeHloPostLayoutAssignment(
      HloModule* hlo_module, se::StreamExecutor* stream_exec,
      se::DeviceMemoryAllocator* device_allocator) override;

  HloDataflowAnalysis::CanShareBuffer GetCanShareBuffer() override;

  GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override;

  StatusOr<std::pair<std::string, std::vector<uint8>>> CompileTargetBinary(
      const HloModuleConfig& module_config, llvm::Module* llvm_module,
      GpuVersion gpu_version, se::StreamExecutor* stream_exec, bool relocatable,
      const HloModule* debug_module) override;

 private:
  StatusOr<std::vector<uint8>> LinkModules(
      se::StreamExecutor* stream_exec,
      std::vector<std::vector<uint8>> modules) override;

  tensorflow::mutex mutex_;

  // When compiling an HLO module, we need to find a path to the NVVM libdevice
  // files. We search in the module's config.debug_options().cuda_data_dir()
  // and in tensorflow::LibdeviceRoot(), the latter of which is a constant.
  //
  // We cache the cuda_data_dir() and the result of our search, so that if the
  // next module we have to compile has the same cuda_data_dir(), we can skip
  // the search.
  string cached_cuda_data_dir_ TF_GUARDED_BY(mutex_);
  string cached_libdevice_dir_ TF_GUARDED_BY(mutex_);

  // Tries to compile the given PTX string to cubin. Returns a vector with the
  // compiled cubin. If compilation was unsuccessful, returns an empty vector.
  std::vector<uint8> CompileGpuAsmOrGetCachedResult(
      se::StreamExecutor* stream_exec, const string& ptx, int cc_major,
      int cc_minor, const HloModuleConfig& hlo_module_config, bool relocatable);

  // The compilation_cache_ map is a cache from
  // {ptx string, cc_major, cc_minor, relocatable} -> cubin, so we don't
  // recompile the same PTX twice. This is important for some interactive
  // workflows. (We also cache at the HLO level, but sometimes we can't realize
  // that two modules are the same until we lower to PTX.)
  //
  // Compilation of distinct PTX happens in parallel. If more than one thread
  // attempts to compile the same PTX, the first thread to obtain
  // cache_value_->mutex_ performs the compilation. The rest wait() on
  // cache_value_->compilation_done_cv_ until the compilation is done.
  //
  // If compiling the PTX fails, we return an empty cubin, cross our fingers,
  // and leave compilation up to the driver.
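  // Key for compilation_cache_: the PTX text plus the target compute
  // capability (cc_major.cc_minor) and the relocatable flag. All four fields
  // participate in both the hash and the equality functor below.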
  struct CompilationCacheKey {
    CompilationCacheKey(std::string ptx, int cc_major, int cc_minor,
                        bool relocatable)
        : ptx(std::move(ptx)),
          cc_major(cc_major),
          cc_minor(cc_minor),
          relocatable(relocatable) {}
    string ptx;
    int cc_major;
    int cc_minor;
    bool relocatable;
  };
  struct CompilationCacheHash {
    size_t operator()(const CompilationCacheKey& key) const {
      return tensorflow::Hash64Combine(
          tensorflow::Hash64Combine(
              tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx),
                                        key.cc_major),
              key.cc_minor),
          key.relocatable);
    }
  };
  struct CompilationCacheEq {
    bool operator()(const CompilationCacheKey& a,
                    const CompilationCacheKey& b) const {
      return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor &&
             a.ptx == b.ptx && a.relocatable == b.relocatable;
    }
  };
  struct CompilationCacheValue {
    bool compilation_done = false;
    std::vector<uint8> cubin_data;
    // Mutex and condition variable used to signal that compilation is done.
    tensorflow::mutex mutex_;
    tensorflow::condition_variable compilation_done_cv_;
  };

  // Don't even think about switching this to flat_hash_map; iterator stability
  // is critical here.
  absl::node_hash_map<CompilationCacheKey, CompilationCacheValue,
                      CompilationCacheHash, CompilationCacheEq>
      compilation_cache_ TF_GUARDED_BY(mutex_);

  TF_DISALLOW_COPY_AND_ASSIGN(NVPTXCompiler);
};

}  // namespace gpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_NVPTX_COMPILER_H_
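
// Illustrative sketch (not part of this header; it would live in a test or
// .cc file): the compile-once, wait-on-condvar scheme described in the
// compilation_cache_ comment above, reduced to standard C++. The names here
// (example::Cache, Entry, GetOrCompile, Compile) are hypothetical stand-ins,
// not XLA or TensorFlow APIs, and the key is simplified to the PTX string
// alone.
#include <condition_variable>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

namespace example {

struct Entry {
  bool done = false;
  std::vector<unsigned char> cubin;
  std::mutex mu;
  std::condition_variable cv;
};

class Cache {
 public:
  std::vector<unsigned char> GetOrCompile(const std::string& ptx) {
    Entry* entry;
    bool inserted;
    {
      // Short critical section: find or create the per-key entry. A
      // node-based map keeps the Entry* valid after the lock is released.
      std::lock_guard<std::mutex> lock(map_mu_);
      auto result = cache_.try_emplace(ptx);
      entry = &result.first->second;
      inserted = result.second;
    }
    std::unique_lock<std::mutex> lock(entry->mu);
    if (inserted) {
      // The inserting thread compiles; every other thread with the same key
      // blocks on the condition variable below until done flips to true.
      entry->cubin = Compile(ptx);
      entry->done = true;
      entry->cv.notify_all();
    } else {
      entry->cv.wait(lock, [&] { return entry->done; });
    }
    return entry->cubin;
  }

 private:
  static std::vector<unsigned char> Compile(const std::string& ptx) {
    // Placeholder for the real ptxas work; returns the bytes of the input.
    return {ptx.begin(), ptx.end()};
  }

  std::mutex map_mu_;
  // std::unordered_map is node-based, so rehashing never moves or invalidates
  // entries; this is the same reason the header insists on absl::node_hash_map
  // rather than flat_hash_map.
  std::unordered_map<std::string, Entry> cache_;
};

}  // namespace example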