/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/match.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const std::string& ptxas_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static std::unordered_set<std::string>* seen_ptxas_paths TF_GUARDED_BY(mu) =
      new std::unordered_set<std::string>();

  tensorflow::mutex_lock lock(mu);
  if (!seen_ptxas_paths->insert(ptxas_path).second) {
    // Already checked this ptxas binary, nothing to do.
    return;
  }

  tensorflow::SubProcess ptxas;
  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!ptxas.Start()) {
    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
    return;
  }

  std::string out;
  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
                                    /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
                 << exit_code;
    return;
  }

  int64 vmaj, vmin, vdot;
  std::string vmaj_str, vmin_str, vdot_str;
  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
                         &vmin_str, &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << out;
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0. An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644. ptxas 9.2.88
  // appears to work, as far as we can tell.
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using ptxas 8.x, but TF requires ptxas 9.x (and strongly "
           "prefers >= 9.2.88). Compilation of XLA kernels below will likely "
           "fail.\n\nYou do not need to update CUDA; cherry-picking the ptxas "
           "binary is sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

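// Compiles the given PTX for the given device and memoizes the result in a
// process-lifetime cache keyed on (device ordinal, PTX text, compilation
// options). The returned span points into the cache entry, so it stays valid
// for the lifetime of the process.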
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
  using PtxCompilerResult = port::StatusOr<std::vector<uint8>>;
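  // The cache below is deliberately leaked (never destroyed), so spans handed
  // out by this function remain valid until process exit.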
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache TF_GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, PtxCompilerResult>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    PtxCompilerResult compiled =
        CompileGpuAsm(device_ordinal, ptx, compilation_options);
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());

  // Failed compilation attempts are cached.
  // Use a separate status check and ValueOrDie invocation on the ptx_cache
  // entry to avoid the value move introduced by TF_ASSIGN_OR_RETURN.
  if (TF_PREDICT_FALSE(!it->second.ok())) {
    return it->second.status();
  }

  const std::vector<uint8>& compiled = it->second.ValueOrDie();
  return absl::MakeSpan(compiled);
}

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));
  return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
}

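// Returns the path to the first existing copy of `binary_name` under the
// bin/ directory of a candidate CUDA root (see
// tensorflow::CandidateCudaRoots). Falls back to the bare binary name,
// leaving resolution to the subprocess launcher's $PATH lookup.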
static std::string findCudaExecutable(const std::string binary_name,
                                      const std::string preferred_cuda_dir) {
#if defined(PLATFORM_WINDOWS)
  const std::string binary_filename = binary_name + ".exe";
#else
  const std::string& binary_filename = binary_name;
#endif

  // Search the candidate CUDA roots.
  auto env = tensorflow::Env::Default();
  std::string binary_path;
  for (const std::string& cuda_root :
       tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
    binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
    VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
    if (env->FileExists(binary_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(binary_path).ok()) {
    // Rely on the subprocess invocation to find the correct binary.
    binary_path = binary_filename;
  }
  VLOG(2) << "Using " << binary_filename << " at " << binary_path;
  return binary_path;
}

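// Warns that ptxas is too old to compile for the given compute capability and
// that we are falling back to the CUDA driver. Logs at most once per
// (ptxas_path, cc_major, cc_minor) triple.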
static void LogPtxasTooOld(const std::string& ptxas_path, int cc_major,
                           int cc_minor) {
  using AlreadyLoggedSetTy =
      absl::flat_hash_set<std::tuple<std::string, int, int>>;

  static absl::Mutex* mutex = new absl::Mutex;
  static AlreadyLoggedSetTy* already_logged = new AlreadyLoggedSetTy;

  absl::MutexLock lock(mutex);

  if (already_logged->insert({ptxas_path, cc_major, cc_minor}).second) {
    LOG(WARNING) << "Falling back to the CUDA driver for PTX compilation; "
                    "ptxas does not support CC "
                 << cc_major << "." << cc_minor;
    LOG(WARNING) << "Used ptxas at " << ptxas_path;
  }
}

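// Compiles the given PTX string for compute capability cc_major.cc_minor by
// shelling out to ptxas and returns the resulting cubin as a byte vector.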
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  std::string ptxas_path =
      findCudaExecutable("ptxas", options.preferred_cuda_dir);

  WarnIfBadPtxasVersion(ptxas_path);

  // Write the PTX into a temporary file.
  std::string ptx_path;
  auto env = tensorflow::Env::Default();
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  // Invoke ptxas and collect its output.
  std::string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so failure to delete it should not
    // produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
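  // Build the ptxas command line. The architecture flag concatenates the
  // major and minor versions, e.g. compute capability 7.5 becomes
  // "-arch=sm_75".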
  std::vector<std::string> ptxas_args = {
      ptxas_path, ptx_path, "-o", cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  if (options.disable_gpuasm_optimizations) {
    ptxas_args.push_back("-O0");
  }
  ptxas_args.insert(ptxas_args.end(), options.extra_flags.begin(),
                    options.extra_flags.end());
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(ptxas_args, " ");
  }

  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  std::string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    // This happens when the installed ptxas is too old for the current GPU.
    // Example error message associated with this error code:
    //   ptxas fatal : Value 'sm_80' is not defined for option 'gpu-name'
    // In that case, fall back to the driver for compilation.
    if (absl::StartsWith(stderr_output, "ptxas fatal : Value '") &&
        absl::StrContains(stderr_output,
                          "is not defined for option 'gpu-name'")) {
      LogPtxasTooOld(ptxas_path, cc_major, cc_minor);
      return tensorflow::errors::Unimplemented(
          ptxas_path, " ptxas too old. Falling back to the driver to compile.");
    }

    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }
  // Print the verbose output of ptxas.
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result of compilation and return it as a byte vector.
  std::string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

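// Bundles the given cubin/PTX images into a single fat binary by writing each
// image to a temporary file and invoking CUDA's fatbinary tool on them.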
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir) {
  std::string fatbinary_path =
      findCudaExecutable("fatbinary", preferred_cuda_dir);

  // Write the images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const CubinOrPTXImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare the temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Invoke fatbinary and collect its output.
  tensorflow::SubProcess fatbinary;
  std::vector<std::string> fatbinary_args = {
      fatbinary_path, "--64", "--cmdline=--compile-only",
      "--link", "--compress-all", absl::StrCat("--create=", result_path)};
  assert(images.size() == image_paths.size());
  for (int i = 0; i < images.size(); i++) {
    fatbinary_args.push_back(absl::StrFormat(
        "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
  }
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(fatbinary_args, " ");
  }
  fatbinary.SetProgram(fatbinary_path, fatbinary_args);
  fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
  if (!fatbinary.Start()) {
    return port::InternalError("Failed to launch fatbinary.");
  }
  std::string stderr_output;
  int exit_status = fatbinary.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "fatbinary exited with non-zero error code %d, output: %s", exit_status,
        stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

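// Returns the path of `binary_relative_path` under rocm_root_dir if the file
// exists; otherwise returns a "<path - NOT FOUND>" marker string, so a later
// subprocess launch fails with a recognizable path in its error message.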
static std::string findRocmExecutable(const std::string& binary_relative_path,
                                      const std::string& rocm_root_dir) {
  auto env = tensorflow::Env::Default();
  std::string binary_path =
      tensorflow::io::JoinPath(rocm_root_dir, binary_relative_path);
  VLOG(2) << "Looking for " << binary_relative_path << " at " << rocm_root_dir;
  if (!env->FileExists(binary_path).ok()) {
    binary_path = absl::StrCat("<", binary_path, " - NOT FOUND>");
  }
  return binary_path;
}

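// Bundles the given HSACO images into a single code object by invoking ROCm's
// clang-offload-bundler on them.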
port::StatusOr<std::vector<uint8>> BundleGpuAsm(
    std::vector<HsacoImage> images, const std::string rocm_root_dir) {
  std::string clang_offload_bundler_path =
      findRocmExecutable("llvm/bin/clang-offload-bundler", rocm_root_dir);

  // Initialize the "--inputs" / "--targets" arguments for the
  // clang-offload-bundler with a dummy file / host target triple:
  // clang-offload-bundler requires exactly one host target triple.
  std::ostringstream inputs_list;
  std::ostringstream targets_list;

  inputs_list << "/dev/null";
  targets_list << "host-x86_64-unknown-linux";
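  // The loop below appends one entry per image, so the final arguments look
  // like the following (gfx906 is just an illustrative arch):
  //   --inputs=/dev/null,<tmp image 1>,<tmp image 2>,...
  //   --targets=host-x86_64-unknown-linux,hip-amdgcn-amd-amdhsa-gfx906,...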

  // Write the images to temporary files.
  std::vector<std::string> image_paths;
  auto env = tensorflow::Env::Default();
  for (const HsacoImage& img : images) {
    std::string img_path;
    if (!env->LocalTempFilename(&img_path)) {
      return port::InternalError(
          "Could not get temporary filenames for images.");
    }
    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
    VLOG(2) << "image written to " << img_path;
    inputs_list << "," << img_path;
    targets_list << ",hip-amdgcn-amd-amdhsa-" << img.gfx_arch;
    image_paths.push_back(std::move(img_path));
  }
  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
    for (const auto& path : image_paths) {
      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
    }
  });

  // Prepare the temporary result file.
  std::string result_path;
  if (!env->LocalTempFilename(&result_path)) {
    return port::InternalError(
        "Could not get temporary filename for fatbin result.");
  }
  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
    // This file may never be created, so the failure to delete it should not
    // propagate to TF.
    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
  });

  // Invoke clang_offload_bundler and collect its output.
  tensorflow::SubProcess clang_offload_bundler;
  std::vector<std::string> clang_offload_bundler_args = {
      clang_offload_bundler_path, absl::StrCat("--inputs=", inputs_list.str()),
      absl::StrCat("--targets=", targets_list.str()), "--type=o",
      absl::StrCat("--outputs=", result_path)};
  if (VLOG_IS_ON(3)) {
    VLOG(3) << absl::StrJoin(clang_offload_bundler_args, " ");
  }
  clang_offload_bundler.SetProgram(clang_offload_bundler_path,
                                   clang_offload_bundler_args);
  clang_offload_bundler.SetChannelAction(tensorflow::CHAN_STDERR,
                                         tensorflow::ACTION_PIPE);
  if (!clang_offload_bundler.Start()) {
    return port::InternalError("Failed to launch clang_offload_bundler.");
  }
  std::string stderr_output;
  int exit_status = clang_offload_bundler.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(absl::StrFormat(
        "clang_offload_bundler exited with non-zero error code %d, output: %s",
        exit_status, stderr_output));
  }
  if (!stderr_output.empty()) {
    VLOG(2) << stderr_output;
  }

  // Read in the result and return it as a byte vector.
  std::string result_blob;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  result_path, &result_blob));
  return std::vector<uint8>(result_blob.begin(), result_blob.end());
}

}  // namespace stream_executor