/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

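// Returns the stdout produced by running "<binary_path> --version".  Results
// are cached per binary path, so each binary is only invoked once.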
static port::StatusOr<absl::string_view> GetPtxasVersionString(
    const std::string& binary_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static auto* seen_binary_paths TF_GUARDED_BY(mu) =
      new absl::flat_hash_map<std::string, std::string>();

  tensorflow::mutex_lock lock(mu);
  auto it = seen_binary_paths->find(binary_path);
  if (it != seen_binary_paths->end()) {
    // Already checked this binary, nothing to do.
    return absl::string_view(it->second);
  }

  tensorflow::SubProcess binary;
  binary.SetProgram(binary_path, {binary_path, "--version"});
  binary.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!binary.Start()) {
    return port::InternalError(
        absl::StrFormat("Couldn't invoke %s --version", binary_path));
  }

  std::string out;
  int exit_code = binary.Communicate(/*stdin_input=*/nullptr, &out,
                                     /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    return port::InternalError(absl::StrFormat(
        "Running %s --version returned %d", binary_path, exit_code));
  }
  auto emplace_it = seen_binary_paths->emplace(binary_path, std::move(out));
  return absl::string_view(emplace_it.first->second);
}

// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
  port::StatusOr<absl::string_view> ptxas_version =
      GetPtxasVersionString(ptxas_path);
  if (!ptxas_version.ok()) {
    LOG(WARNING) << "Couldn't get ptxas version string: "
                 << ptxas_version.status();
    return;
  }

  int64_t vmaj, vmin, vdot;
  std::string vmaj_str, vmin_str, vdot_str;
  if (!RE2::PartialMatch(ptxas_version.ValueOrDie(),
                         R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, &vmin_str,
                         &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << ptxas_version.ValueOrDie();
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0.  An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644.  ptxas 9.2.88
  // appears to work, as far as we can tell.
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
           "prefers >= 9.2.88).  Compilation of XLA kernels below will likely "
           "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
           "binary is sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

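// Compiles the given PTX string for the device at device_ordinal, caching the
// result (including failed compilation attempts) keyed on the device ordinal,
// the PTX text, and the compilation options.  The returned span points into
// the cache entry.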
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
  using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache TF_GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    PtxCompilerResult compiled =
        CompileGpuAsm(device_ordinal, ptx, compilation_options);
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());

  // Failed compilation attempts are cached.
  // Use separate status check and ValueOrDie invocation on ptx_cache
  // entry to avoid value moving introduced by TF_ASSIGN_OR_RETURN.

  if (TF_PREDICT_FALSE(!it->second.ok())) {
    return it->second.status();
  }

  const std::vector<uint8>& compiled = it->second.ValueOrDie();
  return absl::MakeSpan(compiled);
}

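// Compiles the given PTX string by querying the compute capability of the
// device at device_ordinal and delegating to the (cc_major, cc_minor)
// overload below.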
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
  return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}

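// Returns the path to the CUDA binary named binary_name, searching the
// default PATH first when tensorflow::PreferPtxasFromPath() is set, then the
// "bin" directory of each candidate CUDA root.  Results are cached per
// (binary_name, preferred_cuda_dir) pair.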
static std::string FindCudaExecutable(const std::string& binary_name,
                                      const std::string& preferred_cuda_dir) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static auto* seen_binary_paths TF_GUARDED_BY(mu) =
      new absl::flat_hash_map<std::pair<std::string, std::string>,
                              std::string>();

#if defined(PLATFORM_WINDOWS)
  const std::string binary_filename = binary_name + ".exe";
#else
  const std::string& binary_filename = binary_name;
#endif

  auto cache_key = std::make_pair(binary_name, preferred_cuda_dir);

  tensorflow::mutex_lock lock(mu);
  auto it = seen_binary_paths->find(cache_key);
  if (it != seen_binary_paths->end()) {
    return it->second;
  }

  // Try searching in the default PATH first if applicable.
  if (tensorflow::PreferPtxasFromPath() &&
      GetPtxasVersionString(binary_filename).ok()) {
    VLOG(2) << "Using " << binary_filename;
    seen_binary_paths->emplace(std::move(cache_key), binary_filename);
    return binary_filename;
  }

  // Search in cuda root candidates.
  auto env = tensorflow::Env::Default();
  std::string binary_path;
  for (const std::string& cuda_root :
       tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
    binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
    VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
    if (env->FileExists(binary_path).ok() &&
        GetPtxasVersionString(binary_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(binary_path).ok()) {
    // Give up and just rely on subprocess invocation to find the correct
    // binary. This won't work, in all probability, given we already tried that
    // above, but it's the best we can do.
    VLOG(2) << "Unable to find " << binary_name;
    binary_path = binary_filename;
  }
  VLOG(2) << "Using " << binary_filename << " at " << binary_path;
  seen_binary_paths->emplace(std::move(cache_key), binary_path);
  return binary_path;
}

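// Warns, once per (ptxas path, compute capability) combination, that the
// given ptxas does not support the requested compute capability and that PTX
// compilation falls back to the CUDA driver.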
static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
                           int cc_minor) {
  using AlreadyLoggedSetTy =
      absl::flat_hash_set<std::tuple<std::string, int, int>>;

  static absl::Mutex* mutex = new absl::Mutex;
  static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;

  absl::MutexLock lock(mutex);

  if (already_logged->insert(std::make_tuple(ptxas_path, cc_major, cc_minor))
          .second) {
    LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
                    "ptxas does not support CC "
                 << cc_major << "." << cc_minor;
    LOG(WARNING) << "Used ptxas at " << ptxas_path;
  }
}

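// Appends to args the ptxas flags implied by options: "-O0" if gpuasm
// optimizations are disabled, plus any extra flags.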
static void AppendArgsFromOptions(GpuAsmOpts options,
                                  std::vector<std::string>& args) {
  if (options.disable_gpuasm_optimizations) {
    args.push_back("-O0");
  }
  args.insert(args.end(), options.extra_flags.begin(),
              options.extra_flags.end());
}

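// Compiles the given PTX string for compute capability cc_major.cc_minor by
// writing it to a temporary file and invoking ptxas, and returns the
// resulting cubin as a byte vector.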
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  std::string ptxas_path =
      FindCudaExecutable("ptxas", options.preferred_cuda_dir);

  WarnIfBadPtxasVersion(ptxas_path);

  // Write ptx into a temporary file.
  std::string ptx_path;
  auto env = tensorflow::Env::Default();
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  // Invoke ptxas and collect its output.
  std::string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so the failure to delete it should
    // not produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
  std::vector<std::string> ptxas_args = {
      ptxas_path,
      ptx_path,
      "-o",
      cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor),
      "--warn-on-spills"};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  AppendArgsFromOptions(options, ptxas_args);
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(ptxas_args, " ");
  }

  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  std::string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    // This happens when the installed ptxas is too old for the current GPU.
    // Example error message associated with this error code:
    //     ptxas fatal   : Value 'sm_80' is not defined for option 'gpu-name'
    // In that case, fall back to the driver for compilation.
    if (absl::StartsWith(stderr_output, "ptxas fatal   : Value '") &&
        absl::StrContains(stderr_output,
                          "is not defined for option 'gpu-name'")) {
      LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
      return tensorflow::errors::Unimplemented(
          ptxas_path, " ptxas too old. Falling back to the driver to compile.");
    }

    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }
  // Print the verbose output of ptxas.
  if (!stderr_output.empty()) {
    if (absl::StrContains(stderr_output, "warning")) {
      LOG(INFO) << stderr_output;
    } else {
      VLOG(2) << stderr_output;
    }
  }

  // Read in the result of compilation and return it as a byte vector.
  std::string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

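// Bundles the given cubin/PTX images into a single fat binary by writing each
// image to a temporary file and invoking the CUDA "fatbinary" tool.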
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<CubinOrPTXImage> images, GpuAsmOpts options) {
  std::string fatbinary_path =
      FindCudaExecutable("fatbinary", options.preferred_cuda_dir);

  // Write images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const CubinOrPTXImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Compute the ptxas options that were used to produce the cubins.
  std::vector<std::string> ptxas_options;
  AppendArgsFromOptions(options, ptxas_options);

  // Invoke fatbinary and collect its output.
  tensorflow::SubProcess fatbinary;
  std::vector<std::string> fatbinary_args = {
      fatbinary_path, "--64", "--link", "--compress-all",
      absl::StrCat("--create=", result_path)};
  if (!ptxas_options.empty()) {
    auto command_line = absl::StrJoin(ptxas_options, " ");
    fatbinary_args.push_back(absl::StrFormat("--cmdline=%s", command_line));
  }
  assert(images.size() == image_paths.size());
  for (int i = 0; i < images.size(); i++) {
    fatbinary_args.push_back(absl::StrFormat(
        "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
  }
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(fatbinary_args, " ");
  }
  fatbinary.SetProgram(fatbinary_path, fatbinary_args);
  fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
  if (!fatbinary.Start()) {
    return port::InternalError("Failed to launch fatbinary.");
  }
  std::string stderr_output;
  int exit_status = fatbinary.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "fatbinary exited with non-zero error code %d, output: %s", exit_status,
        stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

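// Returns the path to binary_relative_path under rocm_root_dir, or a
// "<... - NOT FOUND>" placeholder if no file exists at that path.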
static std::string findRocmExecutable(const std::string& binary_relative_path,
                                      const std::string& rocm_root_dir) {
  auto env = tensorflow::Env::Default();
  std::string binary_path =
      tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
  VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
  if (!env->FileExists(binary_path).ok()) {
    binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
  }
  return binary_path;
}

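// Bundles the given HSACO images into a single bundle by writing each image
// to a temporary file and invoking ROCm's clang-offload-bundler.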
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir) {
  std::string clang_offload_bundler_path =
      findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);

  // Initialize the "--inputs" / "--targets" arguments for the
  // clang-offload-bundler with a dummy file / host target triple.
  // clang-offload-bundler requires exactly one host target triple.
  std::ostringstream inputs_list;
  std::ostringstream targets_list;

  inputs_list << "/dev/null";
  targets_list << "host-x86_64-unknown-linux";

  // Write images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const HsacoImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    inputs_list << "," << img_path;
    targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Invoke clang_offload_bundler and collect its output.
  tensorflow::SubProcess clang_offload_bundler;
  std::vector<std::string> clang_offload_bundler_args = {
      clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
      absl::StrCat("--targets=", targets_list.str()), "--type=o",
      absl::StrCat("--outputs=", result_path)};
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
  }
  clang_offload_bundler.SetProgram(clang_offload_bundler_path,
                                   clang_offload_bundler_args);
  clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
                                         tensorflow::ACTION_PIPE);
  if (!clang_offload_bundler.Start()) {
    return port::InternalError("Failed to launch clang_offload_bundler.");
  }
  std::string stderr_output;
  int exit_status = clang_offload_bundler.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "clang_offload_bundler exited with non-zero error code %d, output: %s",
        exit_status, stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

}  // namespace stream_executor