/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

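// On ROCm builds and on Windows there is no ptxas to invoke, so the entry
// points below compile to stubs that return an error; the real implementation
// lives in the #else branch.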
#if TENSORFLOW_USE_ROCM || defined(PLATFORM_WINDOWS)

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  // TODO(b/134675935): Subprocess invocation not supported on Windows.
  return port::InternalError(
      "Invoking GPU asm compilation is supported on Cuda non-Windows "
      "platforms only");
}

port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  return CompileGpuAsm(device_ordinal, ptx, compilation_options);
}

#else

// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const string& ptxas_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) =
      new std::unordered_set<string>();

  tensorflow::mutex_lock lock(mu);
  if (!seen_ptxas_paths->insert(ptxas_path).second) {
    // Already checked this ptxas binary; nothing to do.
    return;
  }

  tensorflow::SubProcess ptxas;
  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!ptxas.Start()) {
    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
    return;
  }

  string out;
  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
                                    /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
                 << exit_code;
    return;
  }

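  // ptxas --version reports a line like (illustrative example):
  //   Cuda compilation tools, release 9.2, V9.2.88
  // The regex below extracts the V<major>.<minor>.<patch> triple.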
  int64 vmaj, vmin, vdot;
  string vmaj_str, vmin_str, vdot_str;
  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
                         &vmin_str, &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << out;
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0.  An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644.  ptxas 9.2.88
  // appears to work, as far as we can tell.
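  //
  // Note: the std::make_tuple comparison below is lexicographic, so e.g.
  // (9, 1, 121) < (9, 2, 88).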
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
           "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
           "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
           "binary is sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, std::vector<uint8>>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    TF_ASSIGN_OR_RETURN(
        std::vector<uint8> compiled,
        CompileGpuAsm(device_ordinal, ptx, compilation_options));
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());
  const std::vector<uint8>& compiled = it->second;
  return absl::MakeSpan(compiled);
}
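
// Usage sketch (illustrative only; the caller-side names are hypothetical):
//
//   port::StatusOr<absl::Span<const uint8>> cubin_or =
//       CompileGpuAsmOrGetCached(/*device_ordinal=*/0, ptx_string.c_str(),
//                                GpuAsmOpts());
//
// The returned span points into the process-lifetime cache above (entries are
// never evicted), so it stays valid for the life of the process.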

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));

  string ptxas_path;
  auto env = tensorflow::Env::Default();
  for (const string& cuda_root :
       tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) {
    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
    VLOG(2) << "Looking for ptxas at " << ptxas_path;
    if (env->FileExists(ptxas_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(ptxas_path).ok()) {
    // Rely on subprocess invocation to find the correct binary.
    ptxas_path = "ptxas";
  }
  VLOG(2) << "Using ptxas at " << ptxas_path;

  WarnIfBadPtxasVersion(ptxas_path);

  // Write ptx into a temporary file.
  string ptx_path;
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  // Invoke ptxas and collect its output.
  string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so failure to delete it should not
    // produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
  std::vector<string> ptxas_args = {
      ptxas_path, ptx_path, "-o", cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  if (options.disable_gpuasm_optimizations) {
    ptxas_args.push_back("-O0");
  }
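  // The assembled command line looks like, e.g. (paths are illustrative):
  //   /usr/local/cuda/bin/ptxas /tmp/ptx_src -o /tmp/cubin_out -arch=sm_70 -v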
  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }

  // Read in the result of compilation and return it as a byte vector.
  string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

#endif

}  // namespace stream_executor