/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

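// Returns the output of running "binary_path --version" (e.g. the ptxas
// version banner). The result is cached per binary path, so each binary is
// invoked at most once per process.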
static port::StatusOr<absl::string_view> GetPtxasVersionString(
    const std::string& binary_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static auto* seen_binary_paths TF_GUARDED_BY(mu) =
      new absl::flat_hash_map<std::string, std::string>();

  tensorflow::mutex_lock lock(mu);
  auto it = seen_binary_paths->find(binary_path);
  if (it != seen_binary_paths->end()) {
    // Already checked this binary, nothing to do.
    return absl::string_view(it->second);
  }

  tensorflow::SubProcess binary;
  binary.SetProgram(binary_path, {binary_path, "--version"});
  binary.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!binary.Start()) {
    return port::InternalError(
        absl::StrFormat("Couldn't invoke %s --version", binary_path));
  }

  std::string out;
  int exit_code = binary.Communicate(/*stdin_input=*/nullptr, &out,
                                     /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    return port::InternalError(absl::StrFormat(
        "Running %s --version returned %d", binary_path, exit_code));
  }
  auto emplace_it = seen_binary_paths->emplace(binary_path, std::move(out));
  return absl::string_view(emplace_it.first->second);
}

// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
  port::StatusOr<absl::string_view> ptxas_version =
      GetPtxasVersionString(ptxas_path);
  if (!ptxas_version.ok()) {
    LOG(WARNING) << "Couldn't get ptxas version string: "
                 << ptxas_version.status();
    return;
  }

  int64_t vmaj, vmin, vdot;
  std::string vmaj_str, vmin_str, vdot_str;
  if (!RE2::PartialMatch(ptxas_version.ValueOrDie(),
                         R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str, &vmin_str,
                         &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << ptxas_version.ValueOrDie();
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0. An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644. ptxas 9.2.88
  // appears to work, as far as we can tell.
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
           "prefers >= 9.2.88). Compilation of XLA kernels below will likely "
           "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
           "binary is sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

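// Compiles the given PTX for the given device ordinal and caches the result,
// including failed compilations, in a process-lifetime map keyed on
// (device ordinal, PTX string, compilation options). The returned span aliases
// the cache entry and remains valid for the lifetime of the process.
//
// An illustrative caller sketch (the surrounding names are hypothetical, not
// from this file):
//   TF_ASSIGN_OR_RETURN(
//       absl::Span<const uint8> cubin,
//       CompileGpuAsmOrGetCached(device_ordinal, ptx_string.c_str(),
//                                GpuAsmOpts()));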
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
  using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache TF_GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    PtxCompilerResult compiled =
        CompileGpuAsm(device_ordinal, ptx, compilation_options);
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());

  // Failed compilation attempts are cached as well. Check the status and call
  // ValueOrDie on the cache entry separately rather than using
  // TF_ASSIGN_OR_RETURN, which would move the value out of the cache.
  if (TF_PREDICT_FALSE(!it->second.ok())) {
    return it->second.status();
  }

  const std::vector<uint8>& compiled = it->second.ValueOrDie();
  return absl::MakeSpan(compiled);
}

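// Overload that looks up the compute capability of the device at
// device_ordinal via the GPU driver and delegates to the
// (cc_major, cc_minor) overload below.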
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
  return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}

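// Returns a path to the given CUDA binary (e.g. "ptxas" or "fatbinary").
// Consults the system PATH first if preferred, then the candidate CUDA roots
// derived from preferred_cuda_dir; if nothing is found, falls back to the bare
// filename and relies on subprocess invocation to resolve it. Results are
// cached per (binary name, preferred CUDA dir) pair.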
static std::string FindCudaExecutable(const std::string& binary_name,
                                      const std::string& preferred_cuda_dir) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static auto* seen_binary_paths TF_GUARDED_BY(mu) =
      new absl::flat_hash_map<std::pair<std::string, std::string>,
                              std::string>();

#if defined(PLATFORM_WINDOWS)
  const std::string binary_filename = binary_name + ".exe";
#else
  const std::string& binary_filename = binary_name;
#endif

  auto cache_key = std::make_pair(binary_name, preferred_cuda_dir);

  tensorflow::mutex_lock lock(mu);
  auto it = seen_binary_paths->find(cache_key);
  if (it != seen_binary_paths->end()) {
    return it->second;
  }

  // Try searching in the default PATH first, if applicable.
  if (tensorflow::PreferPtxasFromPath() &&
      GetPtxasVersionString(binary_filename).ok()) {
    VLOG(2) << "Using " << binary_filename;
    seen_binary_paths->emplace(std::move(cache_key), binary_filename);
    return binary_filename;
  }

  // Search in the CUDA root candidates.
  auto env = tensorflow::Env::Default();
  std::string binary_path;
  for (const std::string& cuda_root :
       tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
    binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
    VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
    if (env->FileExists(binary_path).ok() &&
        GetPtxasVersionString(binary_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(binary_path).ok()) {
    // Give up and rely on subprocess invocation to find the correct binary.
    // This is unlikely to work, given that we already tried the PATH above,
    // but it's the best we can do.
    VLOG(2) << "Unable to find " << binary_name;
    binary_path = binary_filename;
  }
  VLOG(2) << "Using " << binary_filename << " at " << binary_path;
  seen_binary_paths->emplace(std::move(cache_key), binary_path);
  return binary_path;
}

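// Logs a warning that ptxas is too old to target the given compute capability.
// Each (ptxas path, cc_major, cc_minor) combination is logged at most once per
// process.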
static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
                           int cc_minor) {
  using AlreadyLoggedSetTy =
      absl::flat_hash_set<std::tuple<std::string, int, int>>;

  static absl::Mutex* mutex = new absl::Mutex;
  static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;

  absl::MutexLock lock(mutex);

  if (already_logged->insert(std::make_tuple(ptxas_path, cc_major, cc_minor))
          .second) {
    LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
                    "ptxas does not support CC "
                 << cc_major << "." << cc_minor;
    LOG(WARNING) << "Used ptxas at " << ptxas_path;
  }
}

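// Appends the ptxas command-line flags implied by `options` (the optimization
// level and any extra flags) to `args`.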
static void AppendArgsFromOptions(GpuAsmOpts options,
                                  std::vector<std::string>& args) {
  if (options.disable_gpuasm_optimizations) {
    args.push_back("-O0");
  }
  args.insert(args.end(), options.extra_flags.begin(),
              options.extra_flags.end());
}

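// Compiles the given PTX to a cubin for compute capability
// sm_<cc_major><cc_minor> by writing the PTX to a temporary file and shelling
// out to ptxas.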
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  std::string ptxas_path =
      FindCudaExecutable("ptxas", options.preferred_cuda_dir);

  WarnIfBadPtxasVersion(ptxas_path);

  // Write the PTX into a temporary file.
  std::string ptx_path;
  auto env = tensorflow::Env::Default();
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  // Invoke ptxas and collect its output.
  std::string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so failing to delete it should not
    // produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
  std::vector<std::string> ptxas_args = {
      ptxas_path,
      ptx_path,
      "-o",
      cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor),
      "--warn-on-spills"};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  AppendArgsFromOptions(options, ptxas_args);
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(ptxas_args, " ");
  }

  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  std::string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    // This happens when the installed ptxas is too old for the current GPU.
    // Example error message associated with this error code:
    //   ptxas fatal : Value 'sm_80' is not defined for option 'gpu-name'
    // In that case, fall back to the driver for compilation.
    if (absl::StartsWith(stderr_output, "ptxas fatal : Value '") &&
        absl::StrContains(stderr_output,
                          "is not defined for option 'gpu-name'")) {
      LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
      return tensorflow::errors::Unimplemented(
          ptxas_path, " ptxas too old. Falling back to the driver to compile.");
    }

    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }
  // Print the verbose output of ptxas.
  if (!stderr_output.empty()) {
    if (absl::StrContains(stderr_output, "warning")) {
      LOG(INFO) << stderr_output;
    } else {
      VLOG(2) << stderr_output;
    }
  }

  // Read in the result of compilation and return it as a byte vector.
  std::string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

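// Bundles the given cubin/PTX images into a single fatbin by writing each
// image to a temporary file and shelling out to the CUDA "fatbinary" tool.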
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<CubinOrPTXImage> images, GpuAsmOpts options) {
  std::string fatbinary_path =
      FindCudaExecutable("fatbinary", options.preferred_cuda_dir);

  // Write the images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const CubinOrPTXImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare a temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so failing to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Compute the ptxas options that were used to produce the cubins.
  std::vector<std::string> ptxas_options;
  AppendArgsFromOptions(options, ptxas_options);

  // Invoke fatbinary and collect its output.
  tensorflow::SubProcess fatbinary;
  std::vector<std::string> fatbinary_args = {
      fatbinary_path, "--64", "--link", "--compress-all",
      absl::StrCat("--create=", result_path)};
  if (!ptxas_options.empty()) {
    auto command_line = absl::StrJoin(ptxas_options, " ");
    fatbinary_args.push_back(absl::StrFormat("--cmdline=%s", command_line));
  }
  assert(images.size() == image_paths.size());
  for (int i = 0; i < images.size(); i++) {
    fatbinary_args.push_back(absl::StrFormat(
        "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
  }
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(fatbinary_args, " ");
  }
  fatbinary.SetProgram(fatbinary_path, fatbinary_args);
  fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
  if (!fatbinary.Start()) {
    return port::InternalError("Failed to launch fatbinary.");
  }
  std::string stderr_output;
  int exit_status = fatbinary.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "fatbinary exited with non-zero error code %d, output: %s", exit_status,
        stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

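// Returns the path to the given binary relative to the ROCm installation root,
// or a "<path - NOT FOUND>" marker string if the file does not exist.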
static std::string findRocmExecutable(const std::string& binary_relative_path,
                                      const std::string& rocm_root_dir) {
  auto env = tensorflow::Env::Default();
  std::string binary_path =
      tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
  VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
  if (!env->FileExists(binary_path).ok()) {
    binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
  }
  return binary_path;
}

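// Bundles the given HSACO images into a single fat binary by shelling out to
// ROCm's clang-offload-bundler.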
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir) {
  std::string clang_offload_bundler_path =
      findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);

  // Initialize the "--inputs" / "--targets" arguments for the
  // clang-offload-bundler with a dummy file / host target triple;
  // clang-offload-bundler requires exactly one host target triple.
  std::ostringstream inputs_list;
  std::ostringstream targets_list;

  inputs_list << "/dev/null";
  targets_list << "host-x86_64-unknown-linux";

  // Write the images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const HsacoImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    inputs_list << "," << img_path;
    targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare a temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so failing to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Invoke clang-offload-bundler and collect its output.
  tensorflow::SubProcess clang_offload_bundler;
  std::vector<std::string> clang_offload_bundler_args = {
      clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
      absl::StrCat("--targets=", targets_list.str()), "--type=o",
      absl::StrCat("--outputs=", result_path)};
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
  }
  clang_offload_bundler.SetProgram(clang_offload_bundler_path,
                                   clang_offload_bundler_args);
  clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
                                         tensorflow::ACTION_PIPE);
  if (!clang_offload_bundler.Start()) {
    return port::InternalError("Failed to launch clang_offload_bundler.");
  }
  std::string stderr_output;
  int exit_status = clang_offload_bundler.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "clang_offload_bundler exited with non-zero error code %d, output: %s",
        exit_status, stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

}  // namespace stream_executor