/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/gpu/asm_compiler.h"

#include "absl/container/flat_hash_map.h"
#include "absl/strings/numbers.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/cuda_libdevice_path.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/regexp.h"
#include "tensorflow/core/platform/subprocess.h"
#include "tensorflow/stream_executor/gpu/gpu_driver.h"
#include "tensorflow/stream_executor/lib/statusor.h"

namespace stream_executor {

#if TENSORFLOW_USE_ROCM || defined(PLATFORM_WINDOWS)

port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  // TODO(b/134675935): Subprocess invocation not supported on Windows.
  return port::InternalError(
      "Invoking GPU asm compilation is supported on CUDA non-Windows "
      "platforms only");
}

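// Compilation is unsupported on this platform, so there is nothing to cache;
// simply forward to the stub above.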
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  return CompileGpuAsm(device_ordinal, ptx, compilation_options);
}

#else

// Prints a warning if the ptxas at ptxas_path has known bugs.
//
// Only prints a warning the first time it's called for a particular value of
// ptxas_path.
//
// Locks on entry.
static void WarnIfBadPtxasVersion(const string& ptxas_path) {
  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
  static std::unordered_set<string>* seen_ptxas_paths GUARDED_BY(mu) =
      new std::unordered_set<string>();

  tensorflow::mutex_lock lock(mu);
  if (!seen_ptxas_paths->insert(ptxas_path).second) {
    // Already checked this ptxas binary, nothing to do.
    return;
  }

  tensorflow::SubProcess ptxas;
  ptxas.SetProgram(ptxas_path, {ptxas_path, "--version"});
  ptxas.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
  if (!ptxas.Start()) {
    LOG(WARNING) << "Couldn't invoke " << ptxas_path << " --version";
    return;
  }

  string out;
  int exit_code = ptxas.Communicate(/*stdin_input=*/nullptr, &out,
                                    /*stderr_output=*/nullptr);
  if (exit_code != 0) {
    LOG(WARNING) << "Running " << ptxas_path << " --version returned "
                 << exit_code;
    return;
  }

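  // `ptxas --version` prints a line like
  //   Cuda compilation tools, release 9.2, V9.2.88
  // so extract the major/minor/dot components from the "V9.2.88" token.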
  int64 vmaj, vmin, vdot;
  string vmaj_str, vmin_str, vdot_str;
  if (!RE2::PartialMatch(out, R"(\bV(\d+)\.(\d+)\.(\d+)\b)", &vmaj_str,
                         &vmin_str, &vdot_str) ||
      !absl::SimpleAtoi(vmaj_str, &vmaj) ||
      !absl::SimpleAtoi(vmin_str, &vmin) ||
      !absl::SimpleAtoi(vdot_str, &vdot)) {
    LOG(WARNING) << "Couldn't parse ptxas version in output of " << ptxas_path
                 << " --version:\n"
                 << out;
    return;
  }

  // We need ptxas >= 9.0 as a hard requirement, because we compile targeting
  // PTX 6.0. An older ptxas will just fail to compile any of our code.
  //
  // ptxas 9.0 before 9.0.276 and ptxas 9.1 before 9.1.121 miscompile some
  // address calculations with large offsets (e.g. "load ptr + large_constant"),
  // b/70245379.
  //
  // ptxas 9.1.121 miscompiles some large multioutput fusions, again in a way
  // that appears related to address calculations, b/111107644. ptxas 9.2.88
  // appears to work, as far as we can tell.
  if (vmaj < 9) {
    LOG(ERROR)
        << "You are using a ptxas version older than 9.x, but TF requires "
           "ptxas 9.x (and strongly prefers >= 9.2.88). Compilation of XLA "
           "kernels below will likely fail.\n\nYou do not need to update "
           "CUDA; cherry-picking the ptxas binary is sufficient.";
  } else if (std::make_tuple(vmaj, vmin, vdot) < std::make_tuple(9, 2, 88)) {
    LOG(WARNING)
        << "*** WARNING *** You are using ptxas " << vmaj << "." << vmin << "."
        << vdot
        << ", which is older than 9.2.88. ptxas 9.x before 9.2.88 is known to "
           "miscompile XLA code, leading to incorrect results or "
           "invalid-address errors.\n\nYou do not need to update to CUDA "
           "9.2.88; cherry-picking the ptxas binary is sufficient.";
  }
}

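// Compiles the given PTX for `device_ordinal` and memoizes the result.
//
// The cache is keyed on (device ordinal, PTX text, compilation options) and is
// never evicted, so the returned span points into the cache entry and stays
// valid for the lifetime of the process.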
port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached(
    int device_ordinal, const char* ptx, GpuAsmOpts compilation_options) {
  using PtxCacheKey = std::tuple<int, std::string, GpuAsmOpts::PtxOptionsTuple>;
  static tensorflow::mutex ptx_cache_mutex(tensorflow::LINKER_INITIALIZED);
  static auto& ptx_cache GUARDED_BY(ptx_cache_mutex) =
      *new absl::flat_hash_map<PtxCacheKey, std::vector<uint8>>();

  tensorflow::mutex_lock lock(ptx_cache_mutex);
  PtxCacheKey cache_key{device_ordinal, std::string(ptx),
                        compilation_options.ToTuple()};
  auto it = ptx_cache.find(cache_key);
  if (it == ptx_cache.end()) {
    TF_ASSIGN_OR_RETURN(
        std::vector<uint8> compiled,
        CompileGpuAsm(device_ordinal, ptx, compilation_options));
    it = ptx_cache.emplace(cache_key, std::move(compiled)).first;
  }

  CHECK(it != ptx_cache.end());
  const std::vector<uint8>& compiled = it->second;
  return absl::MakeSpan(compiled);
}

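// Compiles `ptx_contents` to a cubin for the compute capability of the device
// identified by `device_ordinal`. The PTX is written to a temporary file, a
// locally installed ptxas binary is run on it as a subprocess, and the
// resulting cubin is read back and returned as a byte vector.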
port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal,
                                                 const char* ptx_contents,
                                                 GpuAsmOpts options) {
  gpu::GpuDeviceHandle handle;
  TF_RETURN_IF_ERROR(gpu::GpuDriver::GetDevice(device_ordinal, &handle));
  int cc_major;
  int cc_minor;
  TF_RETURN_IF_ERROR(
      gpu::GpuDriver::GetComputeCapability(&cc_major, &cc_minor, handle));

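  // Search the candidate CUDA install directories for a ptxas binary,
  // starting with `options.preferred_cuda_dir`; fall back to resolving
  // "ptxas" through the subprocess's PATH lookup if none is found.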
  string ptxas_path;
  auto env = tensorflow::Env::Default();
  for (const string& cuda_root :
       tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) {
    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
    VLOG(2) << "Looking for ptxas at " << ptxas_path;
    if (env->FileExists(ptxas_path).ok()) {
      break;
    }
  }
  if (!env->FileExists(ptxas_path).ok()) {
    // Rely on subprocess invocation to find the correct binary.
    ptxas_path = "ptxas";
  }
  VLOG(2) << "Using ptxas at " << ptxas_path;

  WarnIfBadPtxasVersion(ptxas_path);

  // Write ptx into a temporary file.
  string ptx_path;
  if (!env->LocalTempFilename(&ptx_path)) {
    return port::InternalError("couldn't get temp PTX file name");
  }
  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
  });

  TF_RETURN_IF_ERROR(
      tensorflow::WriteStringToFile(env, ptx_path, ptx_contents));
  VLOG(2) << "ptx written to: " << ptx_path;

  // Invoke ptxas and collect its output.
  string cubin_path;
  if (!env->LocalTempFilename(&cubin_path)) {
    return port::InternalError("couldn't get temp CUBIN file name");
  }
  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
    // The CUBIN file may never be created, so failing to delete it should not
    // produce a TF error.
    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
  });
  tensorflow::SubProcess ptxas_info_dumper;
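  // Target the device's compute capability directly, e.g. "-arch=sm_70" for a
  // compute capability 7.0 device.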
  std::vector<string> ptxas_args = {
      ptxas_path, ptx_path, "-o", cubin_path,
      absl::StrCat("-arch=sm_", cc_major, cc_minor)};
  if (VLOG_IS_ON(2)) {
    ptxas_args.push_back("-v");
  }
  if (options.disable_gpuasm_optimizations) {
    ptxas_args.push_back("-O0");
  }
  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
  ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                     tensorflow::ACTION_PIPE);
  if (!ptxas_info_dumper.Start()) {
    return port::InternalError("Failed to launch ptxas");
  }
  string stderr_output;
  int exit_status = ptxas_info_dumper.Communicate(
      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
  if (exit_status != 0) {
    return port::InternalError(
        absl::StrFormat("ptxas exited with non-zero error code %d, output: %s",
                        exit_status, stderr_output));
  }

  // Read in the result of compilation and return it as a byte vector.
  string cubin;
  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
                                                  cubin_path, &cubin));
  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
  return cubin_vector;
}

#endif

}  // namespace stream_executor