/**
 * Copyright 2019-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
#define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_

#include <cublas_v2.h>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <map>
#include <sstream>
#include "utils/log_adapter.h"
#include "utils/trace_base.h"
#include "utils/ms_utils.h"
#include "include/curand.h"

namespace mindspore {
namespace device {
namespace gpu {
#define CHECK_OP_RET_WITH_EXCEPT(expression, message)                                                     \
  do {                                                                                                    \
    bool success = (expression);                                                                          \
    if (!success) {                                                                                       \
      MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Op Error:#dmsg#" << message << " | Error Number: " << success; \
    }                                                                                                     \
  } while (0);

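// Usage sketch (hypothetical caller, not part of this header): wrap a bool-returning
// helper so that a failure raises an internal exception with the given context, e.g.
//   CHECK_OP_RET_WITH_EXCEPT(InitSizeLists(), "Init size lists failed.");
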
#define CHECK_OP_RET_WITH_EXCEPT_TRACE(node, expression, message)                \
  do {                                                                           \
    bool success = (expression);                                                 \
    if (!success) {                                                              \
      MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Op Error:#dmsg#" << message << " | "  \
                                 << trace::DumpSourceLines(node.lock());         \
    }                                                                            \
  } while (0);

#define CHECK_OP_RET_WITH_ERROR(expression, message)                           \
  do {                                                                         \
    bool success = (expression);                                               \
    if (!success) {                                                            \
      MS_LOG(ERROR) << "Op Error: " << message << " | Error Number: " << success; \
    }                                                                          \
  } while (0);

#define CHECK_RET_WITH_RETURN_ERROR(expression, message)                       \
  do {                                                                         \
    bool success = (expression);                                               \
    if (!success) {                                                            \
      MS_LOG(ERROR) << message;                                                \
      return false;                                                            \
    }                                                                          \
  } while (0);

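// Usage sketch (hypothetical caller): the _RETURN_ERROR variant logs and returns false
// from the enclosing bool-returning function instead of throwing, e.g.
//   CHECK_RET_WITH_RETURN_ERROR(stream != nullptr, "The CUDA stream is null.");
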
#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message)                                                           \
  do {                                                                                                                 \
    cudaError_t status = (expression);                                                                                 \
    if (status != cudaSuccess) {                                                                                       \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \
                    << trace::DumpSourceLines(node.lock(), false);                                                     \
    }                                                                                                                  \
  } while (0);

#define CHECK_CUDA_RET_WITH_ERROR_NOTRACE(expression, message)                            \
  do {                                                                                    \
    cudaError_t status = (expression);                                                    \
    if (status != cudaSuccess) {                                                          \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "  \
                    << cudaGetErrorString(status);                                        \
    }                                                                                     \
  } while (0);

#define CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(expression, message)                     \
  do {                                                                                    \
    cudaError_t status = (expression);                                                    \
    if (status != cudaSuccess) {                                                          \
      MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " "  \
                    << cudaGetErrorString(status);                                        \
      return false;                                                                       \
    }                                                                                     \
  } while (0);

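// Usage sketch (hypothetical caller): guard a CUDA runtime call; on failure this variant
// logs the cudaError_t with its string and returns false, e.g.
//   CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(
//     cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream),
//     "cudaMemcpyAsync failed.");
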
#define CHECK_CUDA_STATUS(status, kernel_name)                                                        \
  do {                                                                                                \
    if (status != cudaSuccess) {                                                                      \
      if (status != cudaErrorNotReady && mindspore::common::GetEnv("CUDA_LAUNCH_BLOCKING") != "1") {  \
        MS_LOG(EXCEPTION)                                                                             \
          << "The CUDA kernel failed to run; the error number is " << status << ", which means "      \
          << cudaGetErrorString(status) << ". The name of the failed kernel is uncertain and the "    \
          << "backtrace may be inaccurate, since a CUDA error can be reported asynchronously "        \
          << "at a later function call. Please export CUDA_LAUNCH_BLOCKING=1 for more accurate "      \
          << "error localization.";                                                                   \
      } else {                                                                                        \
        MS_LOG(EXCEPTION) << "For `" << kernel_name << "`, the CUDA kernel failed to run; the error " \
                          << "number is " << status << ", which means " << cudaGetErrorString(status) \
                          << ".";                                                                     \
      }                                                                                               \
    }                                                                                                 \
  } while (0);

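// Usage sketch (hypothetical caller): kernel launches report errors asynchronously, so the
// status passed in is typically fetched right after the launch, e.g.
//   MyKernel<<<blocks, threads, 0, stream>>>(args);  // hypothetical kernel
//   CHECK_CUDA_STATUS(cudaGetLastError(), "MyKernel");
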
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message)                                            \
  do {                                                                                                   \
    cudaError_t status = (expression);                                                                   \
    if (status != cudaSuccess) {                                                                         \
      MS_LOG(EXCEPTION) << "#umsg#CUDA Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << cudaGetErrorString(status) << trace::DumpSourceLines(node.lock());            \
    }                                                                                                    \
  } while (0);

#define CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(expression, message)                                          \
  do {                                                                                                   \
    cudaError_t status = (expression);                                                                   \
    if (status != cudaSuccess) {                                                                         \
      MS_LOG(EXCEPTION) << "#umsg#CUDA Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << cudaGetErrorString(status);                                                   \
    }                                                                                                    \
  } while (0);

#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message)                                            \
  do {                                                                                                    \
    cudnnStatus_t status = (expression);                                                                  \
    if (status != CUDNN_STATUS_SUCCESS) {                                                                 \
      MS_LOG(EXCEPTION) << "#umsg#cuDNN Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << cudnnGetErrorString(status) << trace::DumpSourceLines(node.lock());            \
    }                                                                                                     \
  } while (0);

#define CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(expression, message)                                          \
  do {                                                                                                    \
    cudnnStatus_t status = (expression);                                                                  \
    if (status != CUDNN_STATUS_SUCCESS) {                                                                 \
      MS_LOG(EXCEPTION) << "#umsg#cuDNN Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << cudnnGetErrorString(status);                                                   \
    }                                                                                                     \
  } while (0);

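// Usage sketch (hypothetical caller): wrap a cuDNN call so any non-success cudnnStatus_t
// throws with the decoded error string, e.g.
//   CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(
//     cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w),
//     "cudnnSetTensor4dDescriptor failed.");
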
#define CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(expression, message)                                   \
  do {                                                                                            \
    cudnnStatus_t status = (expression);                                                          \
    if (status != CUDNN_STATUS_SUCCESS) {                                                         \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " "         \
                    << cudnnGetErrorString(status);                                               \
    }                                                                                             \
  } while (0);

#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message)                                     \
  do {                                                                                            \
    cudnnStatus_t status = (expression);                                                          \
    if (status != CUDNN_STATUS_SUCCESS) {                                                         \
      MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " "         \
                    << cudnnGetErrorString(status) << trace::DumpSourceLines(node.lock(), false); \
    }                                                                                             \
  } while (0);

#define CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(expression, message)                                          \
  do {                                                                                                     \
    cublasStatus_t status = (expression);                                                                  \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                                 \
      MS_LOG(EXCEPTION) << "#umsg#cuBLAS Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << mindspore::device::gpu::cuBlasGetErrorString(status);                           \
    }                                                                                                      \
  } while (0);

#define CHECK_CUBLAS_RET_WITH_EXCEPT(node, expression, message)                                            \
  do {                                                                                                     \
    cublasStatus_t status = (expression);                                                                  \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                                 \
      MS_LOG(EXCEPTION) << "#umsg#cuBLAS Error:#umsg#" << message << " | Error Number: " << status << " "  \
                        << mindspore::device::gpu::cuBlasGetErrorString(status)                            \
                        << trace::DumpSourceLines(node.lock());                                            \
    }                                                                                                      \
  } while (0);

#define CHECK_CUBLAS_RET_WITH_ERROR(expression, message)                                    \
  do {                                                                                      \
    cublasStatus_t status = (expression);                                                   \
    if (status != CUBLAS_STATUS_SUCCESS) {                                                  \
      MS_LOG(ERROR) << "cuBLAS Error: " << message << " | Error Number: " << status << " "  \
                    << mindspore::device::gpu::cuBlasGetErrorString(status);                \
    }                                                                                       \
  } while (0);

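// Usage sketch (hypothetical caller): e.g. binding the cuBLAS handle to the compute stream
// before launching BLAS work; cuBlasGetErrorString below decodes the status:
//   CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(cublasSetStream(handle, stream),
//                                        "cublasSetStream failed.");
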
#define CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(expression, message)                                   \
  do {                                                                                                \
    cusolverStatus_t status = (expression);                                                           \
    if (status != CUSOLVER_STATUS_SUCCESS) {                                                          \
      MS_LOG(EXCEPTION) << "#umsg#cusolver Error:#umsg#" << message << " | Error Number: " << status; \
    }                                                                                                 \
  } while (0);

#define CHECK_CUSOLVER_RET_WITH_EXCEPT(node, expression, message)                                    \
  do {                                                                                               \
    cusolverStatus_t status = (expression);                                                          \
    if (status != CUSOLVER_STATUS_SUCCESS) {                                                         \
      MS_LOG(EXCEPTION) << "#umsg#cusolver Error:#umsg#" << message << " | Error Number: " << status \
                        << trace::DumpSourceLines(node.lock());                                      \
    }                                                                                                \
  } while (0);

#define CHECK_CUSOLVER_RET_WITH_ERROR(expression, message)                             \
  do {                                                                                 \
    cusolverStatus_t status = (expression);                                            \
    if (status != CUSOLVER_STATUS_SUCCESS) {                                           \
      MS_LOG(ERROR) << "cusolver Error: " << message << " | Error Number: " << status; \
    }                                                                                  \
  } while (0);

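// Usage sketch (hypothetical caller): cusolver statuses are logged numerically only (there
// is no string helper in this header), e.g. creating the dense-solver handle:
//   CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnCreate(&handle),
//                                          "cusolverDnCreate failed.");
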
#define CHECK_NCCL_RET_WITH_EXCEPT_NOTRACE(expression, message)                                   \
  do {                                                                                            \
    int result = (expression);                                                                    \
    if (result != ncclSuccess) {                                                                  \
      MS_LOG(EXCEPTION) << "#umsg#NCCL Error:#umsg#" << message << " | Error Number: " << result; \
    }                                                                                             \
  } while (0);

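// Usage sketch (hypothetical caller): NCCL collectives return ncclResult_t, compared here as
// an int against ncclSuccess, e.g.
//   CHECK_NCCL_RET_WITH_EXCEPT_NOTRACE(
//     ncclAllReduce(send_buf, recv_buf, count, ncclFloat, ncclSum, comm, stream),
//     "ncclAllReduce failed.");
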
#define CHECK_CUSPARSE_RET_WITH_ERROR(expression, message)                                          \
  do {                                                                                              \
    cusparseStatus_t result = (expression);                                                         \
    if (result != CUSPARSE_STATUS_SUCCESS) {                                                        \
      MS_LOG(ERROR) << "cusparse Error: " << message << " | Error Code: " << result;                \
    }                                                                                               \
  } while (0);

#define CHECK_CUSPARSE_RET_WITH_EXCEPT(expression, message)                                         \
  do {                                                                                              \
    cusparseStatus_t result = (expression);                                                         \
    if (result != CUSPARSE_STATUS_SUCCESS) {                                                        \
      MS_LOG(EXCEPTION) << "#umsg#cusparse Error:#umsg#" << message << " | Error Code: " << result; \
    }                                                                                               \
  } while (0);

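// Usage sketch (hypothetical caller): e.g. creating the cuSPARSE handle:
//   CHECK_CUSPARSE_RET_WITH_EXCEPT(cusparseCreate(&handle), "cusparseCreate failed.");
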
#define VARIABLE_NOT_USED(var) \
  { (void)(var); }

inline bool CheckShapePositive(const std::vector<int64_t> &input_shape) {
  // A shape is valid only if it is non-empty and every dimension is positive.
  return !input_shape.empty() && std::all_of(input_shape.begin(), input_shape.end(), [](int64_t i) { return i > 0; });
}
#define CHECK_SHAPE_POSITIVE(input_shape) mindspore::device::gpu::CheckShapePositive(input_shape)

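// Usage sketch (hypothetical caller): reject empty shapes or shapes with non-positive
// (e.g. dynamic/unknown) dimensions before computing buffer sizes:
//   if (!CHECK_SHAPE_POSITIVE(input_shape)) {
//     MS_LOG(ERROR) << "Input shape must be non-empty with all dimensions > 0.";
//     return false;
//   }
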
inline const char *CurandGetErrorString(curandStatus_t status) {
  switch (status) {
    case CURAND_STATUS_VERSION_MISMATCH:
      return "Header file and linked library version do not match.";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "Generator not initialized.";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "Memory allocation failed.";
    case CURAND_STATUS_TYPE_ERROR:
      return "Generator is wrong type.";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "Argument out of range.";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "Length requested is not a multiple of dimension.";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "GPU does not have double precision required by MRG32k3a.";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "Kernel launch failure.";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "Preexisting failure on library entry.";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "Initialization of CUDA failed.";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "Architecture mismatch, GPU does not support requested feature.";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "Internal library error.";
    default:
      return "Unknown curandStatus.";
  }
}

inline const char *cuBlasGetErrorString(cublasStatus_t status) {
  switch (status) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS: The operation completed successfully.";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED: The cuBLAS library was not initialized.";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED: Resource allocation failed inside the cuBLAS library. This is usually caused "
             "by a cudaMalloc() failure.";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE: An unsupported value or parameter was passed to the function (a negative "
             "vector size, for example).";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH: The function requires a feature absent from the device architecture; "
             "usually caused by compute capability lower than 5.0.";
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR: An access to GPU memory space failed, which is usually caused by a failure "
             "to bind a texture.";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED: The GPU program failed to execute. This is often caused by a launch "
             "failure of the kernel on the GPU, which can be caused by multiple reasons.";
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR: An internal cuBLAS operation failed. This error is usually caused by a "
             "cudaMemcpyAsync() failure.";
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED: The functionality requested is not supported.";
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR: The functionality requested requires some license and an error was detected "
             "when trying to check the current licensing. This error can happen if the license is not present or is "
             "expired or if the environment variable NVIDIA_LICENSE_FILE is not set properly.";
    default:
      return "Unknown cublasStatus.";
  }
}

#define CHECK_CURAND_RET_WITH_EXCEPT(expression, message)                                                      \
  do {                                                                                                         \
    curandStatus_t status = (expression);                                                                      \
    if (status != CURAND_STATUS_SUCCESS) {                                                                     \
      MS_LOG(EXCEPTION) << "#umsg#CUDA curand Error:#umsg#" << message << " | curandStatus: " << status << " " \
                        << mindspore::device::gpu::CurandGetErrorString(status);                               \
    }                                                                                                          \
  } while (0);
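
// Usage sketch (hypothetical caller): e.g. setting up a pseudo-random generator and filling
// a device buffer with uniform floats:
//   CHECK_CURAND_RET_WITH_EXCEPT(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT),
//                                "curandCreateGenerator failed.");
//   CHECK_CURAND_RET_WITH_EXCEPT(curandGenerateUniform(gen, dev_ptr, count),
//                                "curandGenerateUniform failed.");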
}  // namespace gpu
}  // namespace device
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_