1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
17
18 #include <dirent.h>
19 #include <limits.h>
20 #include <link.h>
21 #include <stddef.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/sysmacros.h>
26 #include <unistd.h>
27
28 #include <algorithm>
29 #include <memory>
30 #include <vector>
31
32 #include "absl/container/inlined_vector.h"
33 #include "absl/strings/str_cat.h"
34 #include "absl/strings/str_format.h"
35 #include "absl/strings/str_split.h"
36 #include "absl/strings/strip.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/numbers.h"
39 #include "tensorflow/stream_executor/lib/process_state.h"
40 #include "tensorflow/stream_executor/lib/status.h"
41 #include "tensorflow/stream_executor/platform/logging.h"
42
43 namespace stream_executor {
44 namespace rocm {
45
DriverVersionToString(DriverVersion version)46 string DriverVersionToString(DriverVersion version) {
47 return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
48 std::get<2>(version));
49 }
50
DriverVersionStatusToString(port::StatusOr<DriverVersion> version)51 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
52 if (!version.ok()) {
53 return version.status().ToString();
54 }
55
56 return DriverVersionToString(version.ValueOrDie());
57 }
58
StringToDriverVersion(const string & value)59 port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
60 std::vector<string> pieces = absl::StrSplit(value, '.');
61 if (pieces.size() != 2 && pieces.size() != 3) {
62 return port::Status{port::error::INVALID_ARGUMENT,
63 absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
64 "for driver version; got \"%s\"",
65 value.c_str())};
66 }
67
68 int major;
69 int minor;
70 int patch = 0;
71 if (!port::safe_strto32(pieces[0], &major)) {
72 return port::Status{
73 port::error::INVALID_ARGUMENT,
74 absl::StrFormat("could not parse major version number \"%s\" as an "
75 "integer from string \"%s\"",
76 pieces[0].c_str(), value.c_str())};
77 }
78 if (!port::safe_strto32(pieces[1], &minor)) {
79 return port::Status{
80 port::error::INVALID_ARGUMENT,
81 absl::StrFormat("could not parse minor version number \"%s\" as an "
82 "integer from string \"%s\"",
83 pieces[1].c_str(), value.c_str())};
84 }
85 if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
86 return port::Status{
87 port::error::INVALID_ARGUMENT,
88 absl::StrFormat("could not parse patch version number \"%s\" as an "
89 "integer from string \"%s\"",
90 pieces[2].c_str(), value.c_str())};
91 }
92
93 DriverVersion result{major, minor, patch};
94 VLOG(2) << "version string \"" << value << "\" made value "
95 << DriverVersionToString(result);
96 return result;
97 }
98
99 } // namespace rocm
100 } // namespace stream_executor
101
102 namespace stream_executor {
103 namespace gpu {
104
105 // -- class Diagnostician
106
GetDevNodePath(int dev_node_ordinal)107 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
108 return absl::StrCat("/dev/kfd", dev_node_ordinal);
109 }
110
LogDiagnosticInformation()111 void Diagnostician::LogDiagnosticInformation() {
112 LOG(INFO) << "retrieving ROCM diagnostic information for host: "
113 << port::Hostname();
114
115 LogDriverVersionInformation();
116 }
117
LogDriverVersionInformation()118 /* static */ void Diagnostician::LogDriverVersionInformation() {
119 LOG(INFO) << "hostname: " << port::Hostname();
120 if (VLOG_IS_ON(1)) {
121 const char* value = getenv("LD_LIBRARY_PATH");
122 string library_path = value == nullptr ? "" : value;
123 VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
124
125 std::vector<string> pieces = absl::StrSplit(library_path, ':');
126 for (const auto& piece : pieces) {
127 if (piece.empty()) {
128 continue;
129 }
130 DIR* dir = opendir(piece.c_str());
131 if (dir == nullptr) {
132 VLOG(1) << "could not open \"" << piece << "\"";
133 continue;
134 }
135 while (dirent* entity = readdir(dir)) {
136 VLOG(1) << piece << " :: " << entity->d_name;
137 }
138 closedir(dir);
139 }
140 }
141 port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
142 LOG(INFO) << "librocm reported version is: "
143 << rocm::DriverVersionStatusToString(dso_version);
144
145 port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
146 LOG(INFO) << "kernel reported version is: "
147 << rocm::DriverVersionStatusToString(kernel_version);
148
149 if (kernel_version.ok() && dso_version.ok()) {
150 WarnOnDsoKernelMismatch(dso_version, kernel_version);
151 }
152 }
153
154 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
155 // driver-interfacing DSO version number. Returns it as a string.
FindDsoVersion()156 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
157 port::StatusOr<DriverVersion> result{port::Status{
158 port::error::NOT_FOUND,
159 "was unable to find librocm.so DSO loaded into this program"}};
160
161 // Callback used when iterating through DSOs. Looks for the driver-interfacing
162 // DSO and yields its version number into the callback data, when found.
163 auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
164 void* data) -> int {
165 if (strstr(info->dlpi_name, "librocm.so.1")) {
166 VLOG(1) << "found DLL info with name: " << info->dlpi_name;
167 char resolved_path[PATH_MAX] = {0};
168 if (realpath(info->dlpi_name, resolved_path) == nullptr) {
169 return 0;
170 }
171 VLOG(1) << "found DLL info with resolved path: " << resolved_path;
172 const char* slash = rindex(resolved_path, '/');
173 if (slash == nullptr) {
174 return 0;
175 }
176 const char* so_suffix = ".so.";
177 const char* dot = strstr(slash, so_suffix);
178 if (dot == nullptr) {
179 return 0;
180 }
181 string dso_version = dot + strlen(so_suffix);
182 // TODO(b/22689637): Eliminate the explicit namespace if possible.
183 auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
184 auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
185 *result = rocm::StringToDriverVersion(string(stripped_dso_version));
186 return 1;
187 }
188 return 0;
189 };
190
191 dl_iterate_phdr(iterate_phdr, &result);
192
193 return result;
194 }
195
FindKernelModuleVersion(const string & driver_version_file_contents)196 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
197 const string& driver_version_file_contents) {
198 static const char* kDriverFilePrelude = "Kernel Module ";
199 size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
200 if (offset == string::npos) {
201 return port::Status{
202 port::error::NOT_FOUND,
203 absl::StrCat("could not find kernel module information in "
204 "driver version file contents: \"",
205 driver_version_file_contents, "\"")};
206 }
207
208 string version_and_rest = driver_version_file_contents.substr(
209 offset + strlen(kDriverFilePrelude), string::npos);
210 size_t space_index = version_and_rest.find(" ");
211 auto kernel_version = version_and_rest.substr(0, space_index);
212 // TODO(b/22689637): Eliminate the explicit namespace if possible.
213 auto stripped_kernel_version = absl::StripSuffix(kernel_version, ".ld64");
214 return rocm::StringToDriverVersion(string(stripped_kernel_version));
215 }
216
WarnOnDsoKernelMismatch(port::StatusOr<DriverVersion> dso_version,port::StatusOr<DriverVersion> kernel_version)217 void Diagnostician::WarnOnDsoKernelMismatch(
218 port::StatusOr<DriverVersion> dso_version,
219 port::StatusOr<DriverVersion> kernel_version) {
220 if (kernel_version.ok() && dso_version.ok() &&
221 dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
222 LOG(INFO) << "kernel version seems to match DSO: "
223 << rocm::DriverVersionToString(kernel_version.ValueOrDie());
224 } else {
225 LOG(ERROR) << "kernel version "
226 << rocm::DriverVersionStatusToString(kernel_version)
227 << " does not match DSO version "
228 << rocm::DriverVersionStatusToString(dso_version)
229 << " -- cannot find working devices in this configuration";
230 }
231 }
232
FindKernelDriverVersion()233 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
234 auto status = port::Status{port::error::UNIMPLEMENTED,
235 "kernel reported driver version not implemented"};
236 return status;
237 }
238
239 } // namespace gpu
240 } // namespace stream_executor
241