• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
17 
18 #include <dirent.h>
19 #include <limits.h>
20 #include <link.h>
21 #include <stddef.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <sys/sysmacros.h>
26 #include <unistd.h>
27 
28 #include <algorithm>
29 #include <memory>
30 #include <vector>
31 
32 #include "absl/container/inlined_vector.h"
33 #include "absl/strings/str_cat.h"
34 #include "absl/strings/str_format.h"
35 #include "absl/strings/str_split.h"
36 #include "absl/strings/strip.h"
37 #include "tensorflow/stream_executor/lib/error.h"
38 #include "tensorflow/stream_executor/lib/numbers.h"
39 #include "tensorflow/stream_executor/lib/process_state.h"
40 #include "tensorflow/stream_executor/lib/status.h"
41 #include "tensorflow/stream_executor/platform/logging.h"
42 
43 namespace stream_executor {
44 namespace rocm {
45 
DriverVersionToString(DriverVersion version)46 string DriverVersionToString(DriverVersion version) {
47   return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
48                          std::get<2>(version));
49 }
50 
DriverVersionStatusToString(port::StatusOr<DriverVersion> version)51 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
52   if (!version.ok()) {
53     return version.status().ToString();
54   }
55 
56   return DriverVersionToString(version.ValueOrDie());
57 }
58 
StringToDriverVersion(const string & value)59 port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
60   std::vector<string> pieces = absl::StrSplit(value, '.');
61   if (pieces.size() != 2 && pieces.size() != 3) {
62     return port::Status{port::error::INVALID_ARGUMENT,
63                         absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
64                                         "for driver version; got \"%s\"",
65                                         value.c_str())};
66   }
67 
68   int major;
69   int minor;
70   int patch = 0;
71   if (!port::safe_strto32(pieces[0], &major)) {
72     return port::Status{
73         port::error::INVALID_ARGUMENT,
74         absl::StrFormat("could not parse major version number \"%s\" as an "
75                         "integer from string \"%s\"",
76                         pieces[0].c_str(), value.c_str())};
77   }
78   if (!port::safe_strto32(pieces[1], &minor)) {
79     return port::Status{
80         port::error::INVALID_ARGUMENT,
81         absl::StrFormat("could not parse minor version number \"%s\" as an "
82                         "integer from string \"%s\"",
83                         pieces[1].c_str(), value.c_str())};
84   }
85   if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
86     return port::Status{
87         port::error::INVALID_ARGUMENT,
88         absl::StrFormat("could not parse patch version number \"%s\" as an "
89                         "integer from string \"%s\"",
90                         pieces[2].c_str(), value.c_str())};
91   }
92 
93   DriverVersion result{major, minor, patch};
94   VLOG(2) << "version string \"" << value << "\" made value "
95           << DriverVersionToString(result);
96   return result;
97 }
98 
99 }  // namespace rocm
100 }  // namespace stream_executor
101 
102 namespace stream_executor {
103 namespace gpu {
104 
105 // -- class Diagnostician
106 
GetDevNodePath(int dev_node_ordinal)107 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
108   return absl::StrCat("/dev/kfd", dev_node_ordinal);
109 }
110 
LogDiagnosticInformation()111 void Diagnostician::LogDiagnosticInformation() {
112   LOG(INFO) << "retrieving ROCM diagnostic information for host: "
113             << port::Hostname();
114 
115   LogDriverVersionInformation();
116 }
117 
LogDriverVersionInformation()118 /* static */ void Diagnostician::LogDriverVersionInformation() {
119   LOG(INFO) << "hostname: " << port::Hostname();
120   if (VLOG_IS_ON(1)) {
121     const char* value = getenv("LD_LIBRARY_PATH");
122     string library_path = value == nullptr ? "" : value;
123     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
124 
125     std::vector<string> pieces = absl::StrSplit(library_path, ':');
126     for (const auto& piece : pieces) {
127       if (piece.empty()) {
128         continue;
129       }
130       DIR* dir = opendir(piece.c_str());
131       if (dir == nullptr) {
132         VLOG(1) << "could not open \"" << piece << "\"";
133         continue;
134       }
135       while (dirent* entity = readdir(dir)) {
136         VLOG(1) << piece << " :: " << entity->d_name;
137       }
138       closedir(dir);
139     }
140   }
141   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
142   LOG(INFO) << "librocm reported version is: "
143             << rocm::DriverVersionStatusToString(dso_version);
144 
145   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
146   LOG(INFO) << "kernel reported version is: "
147             << rocm::DriverVersionStatusToString(kernel_version);
148 
149   if (kernel_version.ok() && dso_version.ok()) {
150     WarnOnDsoKernelMismatch(dso_version, kernel_version);
151   }
152 }
153 
154 // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
155 // driver-interfacing DSO version number. Returns it as a string.
FindDsoVersion()156 port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
157   port::StatusOr<DriverVersion> result{port::Status{
158       port::error::NOT_FOUND,
159       "was unable to find librocm.so DSO loaded into this program"}};
160 
161   // Callback used when iterating through DSOs. Looks for the driver-interfacing
162   // DSO and yields its version number into the callback data, when found.
163   auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
164                          void* data) -> int {
165     if (strstr(info->dlpi_name, "librocm.so.1")) {
166       VLOG(1) << "found DLL info with name: " << info->dlpi_name;
167       char resolved_path[PATH_MAX] = {0};
168       if (realpath(info->dlpi_name, resolved_path) == nullptr) {
169         return 0;
170       }
171       VLOG(1) << "found DLL info with resolved path: " << resolved_path;
172       const char* slash = rindex(resolved_path, '/');
173       if (slash == nullptr) {
174         return 0;
175       }
176       const char* so_suffix = ".so.";
177       const char* dot = strstr(slash, so_suffix);
178       if (dot == nullptr) {
179         return 0;
180       }
181       string dso_version = dot + strlen(so_suffix);
182       // TODO(b/22689637): Eliminate the explicit namespace if possible.
183       auto stripped_dso_version = absl::StripSuffix(dso_version, ".ld64");
184       auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
185       *result = rocm::StringToDriverVersion(string(stripped_dso_version));
186       return 1;
187     }
188     return 0;
189   };
190 
191   dl_iterate_phdr(iterate_phdr, &result);
192 
193   return result;
194 }
195 
FindKernelModuleVersion(const string & driver_version_file_contents)196 port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
197     const string& driver_version_file_contents) {
198   static const char* kDriverFilePrelude = "Kernel Module  ";
199   size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
200   if (offset == string::npos) {
201     return port::Status{
202         port::error::NOT_FOUND,
203         absl::StrCat("could not find kernel module information in "
204                      "driver version file contents: \"",
205                      driver_version_file_contents, "\"")};
206   }
207 
208   string version_and_rest = driver_version_file_contents.substr(
209       offset + strlen(kDriverFilePrelude), string::npos);
210   size_t space_index = version_and_rest.find(" ");
211   auto kernel_version = version_and_rest.substr(0, space_index);
212   // TODO(b/22689637): Eliminate the explicit namespace if possible.
213   auto stripped_kernel_version = absl::StripSuffix(kernel_version, ".ld64");
214   return rocm::StringToDriverVersion(string(stripped_kernel_version));
215 }
216 
WarnOnDsoKernelMismatch(port::StatusOr<DriverVersion> dso_version,port::StatusOr<DriverVersion> kernel_version)217 void Diagnostician::WarnOnDsoKernelMismatch(
218     port::StatusOr<DriverVersion> dso_version,
219     port::StatusOr<DriverVersion> kernel_version) {
220   if (kernel_version.ok() && dso_version.ok() &&
221       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
222     LOG(INFO) << "kernel version seems to match DSO: "
223               << rocm::DriverVersionToString(kernel_version.ValueOrDie());
224   } else {
225     LOG(ERROR) << "kernel version "
226                << rocm::DriverVersionStatusToString(kernel_version)
227                << " does not match DSO version "
228                << rocm::DriverVersionStatusToString(dso_version)
229                << " -- cannot find working devices in this configuration";
230   }
231 }
232 
FindKernelDriverVersion()233 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
234   auto status = port::Status{port::error::UNIMPLEMENTED,
235                              "kernel reported driver version not implemented"};
236   return status;
237 }
238 
239 }  // namespace gpu
240 }  // namespace stream_executor
241