• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
18 #define MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
19 
20 #include <cublas_v2.h>
21 #include <iostream>
22 #include <vector>
23 #include <string>
24 #include <algorithm>
25 #include <map>
26 #include <sstream>
27 #include "utils/log_adapter.h"
28 #include "utils/trace_base.h"
29 #include "utils/ms_utils.h"
30 #include "include/curand.h"
31 
32 namespace mindspore {
33 namespace device {
34 namespace gpu {
// Evaluates `expression` once as a bool. When the result is false, reports an op error through
// MS_LOG(INTERNAL_EXCEPTION) with `message` appended.
// The value streamed after "Error Number: " is the bool itself, so it is always false on this path.
// The expansion already ends with ';'.
#define CHECK_OP_RET_WITH_EXCEPT(expression, message) \
  do { \
    bool success = (expression); \
    if (success) { \
      break; \
    } \
    MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Op Error:#dmsg#" << message << " | Error Number: " << success; \
  } while (0);
42 
// Evaluates `expression` once as a bool. When the result is false, reports an op error through
// MS_LOG(INTERNAL_EXCEPTION), appending `message` and the output of trace::DumpSourceLines for
// the object returned by `node`.lock().
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed. The expansion already ends with ';'.
#define CHECK_OP_RET_WITH_EXCEPT_TRANCE(node, expression, message) \
  do { \
    bool success = (expression); \
    if (success) { \
      break; \
    } \
    MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Op Error:#dmsg#" << message << " | " \
                               << trace::DumpSourceLines((node).lock()); \
  } while (0);
51 
// Evaluates `expression` once as a bool. When the result is false, logs an op error at ERROR
// level with `message` appended; execution then continues after the macro.
// The value streamed after "Error Number: " is the bool itself, so it is always false on this path.
#define CHECK_OP_RET_WITH_ERROR(expression, message) \
  do { \
    bool success = (expression); \
    if (success) { \
      break; \
    } \
    MS_LOG(ERROR) << "Op Error: " << message << " | Error Number: " << success; \
  } while (0);
59 
// Evaluates `expression` once as a bool. When the result is false, logs `message` at ERROR level
// and executes `return false;` in the enclosing function, so it can only be used where
// returning `false` is valid.
#define CHECK_RET_WITH_RETURN_ERROR(expression, message) \
  do { \
    bool success = (expression); \
    if (success) { \
      break; \
    } \
    MS_LOG(ERROR) << message; \
    return false; \
  } while (0);
68 
// Evaluates `expression` once as a cudaError_t. When it is not cudaSuccess, logs at ERROR level:
// `message`, the numeric status, cudaGetErrorString(status), and the output of
// trace::DumpSourceLines(<node>.lock(), false). Execution continues after the macro.
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed.
#define CHECK_CUDA_RET_WITH_ERROR(node, expression, message) \
  do { \
    cudaError_t status = (expression); \
    if (status == cudaSuccess) { \
      break; \
    } \
    MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " << cudaGetErrorString(status) \
                  << trace::DumpSourceLines((node).lock(), false); \
  } while (0);
77 
// Evaluates `expression` once as a cudaError_t. When it is not cudaSuccess, logs at ERROR level:
// `message`, the numeric status and cudaGetErrorString(status). No source trace is appended and
// execution continues after the macro.
#define CHECK_CUDA_RET_WITH_ERROR_NOTRACE(expression, message) \
  do { \
    cudaError_t status = (expression); \
    if (status == cudaSuccess) { \
      break; \
    } \
    MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
                  << cudaGetErrorString(status); \
  } while (0);
86 
// Evaluates `expression` once as a cudaError_t. When it is not cudaSuccess, logs at ERROR level
// (`message`, numeric status, cudaGetErrorString(status)) and executes `return false;` in the
// enclosing function, so it can only be used where returning `false` is valid.
#define CHECK_CUDA_RET_WITH_RETURN_ERROR_NOTRACE(expression, message) \
  do { \
    cudaError_t status = (expression); \
    if (status == cudaSuccess) { \
      break; \
    } \
    MS_LOG(ERROR) << "CUDA Error: " << message << " | Error Number: " << status << " " \
                  << cudaGetErrorString(status); \
    return false; \
  } while (0);
96 
// Checks an already-obtained CUDA status for the kernel called `kernel_name`.
// Nothing happens when the status is cudaSuccess. Otherwise one of two messages is reported
// through MS_LOG(EXCEPTION):
//   - status is not cudaErrorNotReady and the CUDA_LAUNCH_BLOCKING environment variable is not
//     "1": a message stating that the failing kernel is uncertain;
//   - otherwise: a message naming `kernel_name`.
// The `status` argument is evaluated exactly once and copied into a local, so passing a call
// such as cudaGetLastError() yields the same value for the check and for the message.
#define CHECK_CUDA_STATUS(status, kernel_name) \
  do { \
    const cudaError_t checked_cuda_status_ = (status); \
    if (checked_cuda_status_ == cudaSuccess) { \
      break; \
    } \
    if (checked_cuda_status_ != cudaErrorNotReady && mindspore::common::GetEnv("CUDA_LAUNCH_BLOCKING") != "1") { \
      MS_LOG(EXCEPTION) \
        << "The cuda Kernel fails to run, the error number is " << checked_cuda_status_ << ", which means " \
        << cudaGetErrorString(checked_cuda_status_) << ". But the name of failed kernel is uncertain and the " \
        << "backtrace of error might be incorrect, since CUDA error might be asynchronously reported " \
        << "at some other function call. Please exporting CUDA_LAUNCH_BLOCKING=1 for more accurate " \
        << "error positioning."; \
    } else { \
      MS_LOG(EXCEPTION) << "For `" << kernel_name << "`, the cuda Kernel fails to run, the error number is " \
                        << checked_cuda_status_ << ", which means " << cudaGetErrorString(checked_cuda_status_) \
                        << "."; \
    } \
  } while (0);
113 
// Evaluates `expression` once as a cudaError_t. When it is not cudaSuccess, reports through
// MS_LOG(EXCEPTION): `message`, the numeric status, cudaGetErrorString(status), and the output of
// trace::DumpSourceLines(<node>.lock()).
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed.
#define CHECK_CUDA_RET_WITH_EXCEPT(node, expression, message) \
  do { \
    cudaError_t status = (expression); \
    if (status == cudaSuccess) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#CUDA Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << cudaGetErrorString(status) << trace::DumpSourceLines((node).lock()); \
  } while (0);
122 
// Evaluates `expression` once as a cudaError_t. When it is not cudaSuccess, reports through
// MS_LOG(EXCEPTION): `message`, the numeric status and cudaGetErrorString(status).
// No source trace is appended.
#define CHECK_CUDA_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  do { \
    cudaError_t status = (expression); \
    if (status == cudaSuccess) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#CUDA Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << cudaGetErrorString(status); \
  } while (0);
131 
// Evaluates `expression` once as a cudnnStatus_t. When it is not CUDNN_STATUS_SUCCESS, reports
// through MS_LOG(EXCEPTION): `message`, the numeric status, cudnnGetErrorString(status), and the
// output of trace::DumpSourceLines(<node>.lock()).
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed. The cuDNN header is not included by this file; the user must provide it.
#define CHECK_CUDNN_RET_WITH_EXCEPT(node, expression, message) \
  do { \
    cudnnStatus_t status = (expression); \
    if (status == CUDNN_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cuDNN Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << cudnnGetErrorString(status) << trace::DumpSourceLines((node).lock()); \
  } while (0);
140 
// Evaluates `expression` once as a cudnnStatus_t. When it is not CUDNN_STATUS_SUCCESS, reports
// through MS_LOG(EXCEPTION): `message`, the numeric status and cudnnGetErrorString(status).
// No source trace is appended.
#define CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  do { \
    cudnnStatus_t status = (expression); \
    if (status == CUDNN_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cuDNN Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << cudnnGetErrorString(status); \
  } while (0);
149 
// Evaluates `expression` once as a cudnnStatus_t. When it is not CUDNN_STATUS_SUCCESS, logs at
// ERROR level: `message`, the numeric status and cudnnGetErrorString(status). No source trace is
// appended and execution continues after the macro.
#define CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(expression, message) \
  do { \
    cudnnStatus_t status = (expression); \
    if (status == CUDNN_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " " \
                  << cudnnGetErrorString(status); \
  } while (0);
158 
// Evaluates `expression` once as a cudnnStatus_t. When it is not CUDNN_STATUS_SUCCESS, logs at
// ERROR level: `message`, the numeric status, cudnnGetErrorString(status), and the output of
// trace::DumpSourceLines(<node>.lock(), false). Execution continues after the macro.
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed.
#define CHECK_CUDNN_RET_WITH_ERROR(node, expression, message) \
  do { \
    cudnnStatus_t status = (expression); \
    if (status == CUDNN_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(ERROR) << "cuDNN Error: " << message << " | Error Number: " << status << " " \
                  << cudnnGetErrorString(status) << trace::DumpSourceLines((node).lock(), false); \
  } while (0);
167 
// Evaluates `expression` once as a cublasStatus_t. When it is not CUBLAS_STATUS_SUCCESS, reports
// through MS_LOG(EXCEPTION): `message`, the numeric status and the text returned by
// mindspore::device::gpu::cuBlasGetErrorString(status). No source trace is appended.
#define CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  do { \
    cublasStatus_t status = (expression); \
    if (status == CUBLAS_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cuBLAS Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << mindspore::device::gpu::cuBlasGetErrorString(status); \
  } while (0);
176 
// Evaluates `expression` once as a cublasStatus_t. When it is not CUBLAS_STATUS_SUCCESS, reports
// through MS_LOG(EXCEPTION): `message`, the numeric status, the text returned by
// mindspore::device::gpu::cuBlasGetErrorString(status), and the output of
// trace::DumpSourceLines(<node>.lock()).
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed.
#define CHECK_CUBLAS_RET_WITH_EXCEPT(node, expression, message) \
  do { \
    cublasStatus_t status = (expression); \
    if (status == CUBLAS_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cuBLAS Error:#umsg#" << message << " | Error Number: " << status << " " \
                      << mindspore::device::gpu::cuBlasGetErrorString(status) \
                      << trace::DumpSourceLines((node).lock()); \
  } while (0);
186 
// Evaluates `expression` once as a cublasStatus_t. When it is not CUBLAS_STATUS_SUCCESS, logs at
// ERROR level: `message`, the numeric status and the text returned by
// mindspore::device::gpu::cuBlasGetErrorString(status). Execution continues after the macro.
#define CHECK_CUBLAS_RET_WITH_ERROR(expression, message) \
  do { \
    cublasStatus_t status = (expression); \
    if (status == CUBLAS_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(ERROR) << "cuBLAS Error: " << message << " | Error Number: " << status << " " \
                  << mindspore::device::gpu::cuBlasGetErrorString(status); \
  } while (0);
195 
// Evaluates `expression` once as a cusolverStatus_t. When it is not CUSOLVER_STATUS_SUCCESS,
// reports through MS_LOG(EXCEPTION): `message` and the numeric status only (no status text and
// no source trace). The cuSOLVER header is not included by this file; the user must provide it.
#define CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  do { \
    cusolverStatus_t status = (expression); \
    if (status == CUSOLVER_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cusolver Error:#umsg#" << message << " | Error Number: " << status; \
  } while (0);
203 
// Evaluates `expression` once as a cusolverStatus_t. When it is not CUSOLVER_STATUS_SUCCESS,
// reports through MS_LOG(EXCEPTION): `message`, the numeric status, and the output of
// trace::DumpSourceLines(<node>.lock()).
// `node` is parenthesized in the expansion so any expression yielding an object with a lock()
// member can be passed.
#define CHECK_CUSOLVER_RET_WITH_EXCEPT(node, expression, message) \
  do { \
    cusolverStatus_t status = (expression); \
    if (status == CUSOLVER_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cusolver Error:#umsg#" << message << " | Error Number: " << status \
                      << trace::DumpSourceLines((node).lock()); \
  } while (0);
213 
// Evaluates `expression` once as a cusolverStatus_t. When it is not CUSOLVER_STATUS_SUCCESS, logs
// `message` and the numeric status at ERROR level. Execution continues after the macro.
#define CHECK_CUSOLVER_RET_WITH_ERROR(expression, message) \
  do { \
    cusolverStatus_t status = (expression); \
    if (status == CUSOLVER_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(ERROR) << "cusolver Error: " << message << " | Error Number: " << status; \
  } while (0);
221 
// Evaluates `expression` once, stores the result in an int and compares it with ncclSuccess.
// On mismatch, reports through MS_LOG(EXCEPTION): `message` and the numeric result.
// No source trace is appended. The NCCL header is not included by this file; the user must
// provide it.
// This macro was previously defined twice in a row with identical text; a single definition is kept.
#define CHECK_NCCL_RET_WITH_EXCEPT_NOTRACE(expression, message) \
  do { \
    int result = (expression); \
    if (result == ncclSuccess) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#NCCL Error:#umsg#" << message << " | Error Number: " << result; \
  } while (0);
237 
// Evaluates `expression` once as a cusparseStatus_t. When it is not CUSPARSE_STATUS_SUCCESS, logs
// `message` and the numeric code at ERROR level. Execution continues after the macro.
// The cuSPARSE header is not included by this file; the user must provide it.
#define CHECK_CUSPARSE_RET_WITH_ERROR(expression, message) \
  do { \
    cusparseStatus_t result = (expression); \
    if (result == CUSPARSE_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(ERROR) << "cusparse Error: " << message << " | Error Code: " << result; \
  } while (0);
245 
// Evaluates `expression` once as a cusparseStatus_t. When it is not CUSPARSE_STATUS_SUCCESS,
// reports `message` and the numeric code through MS_LOG(EXCEPTION).
#define CHECK_CUSPARSE_RET_WITH_EXCEPT(expression, message) \
  do { \
    cusparseStatus_t result = (expression); \
    if (result == CUSPARSE_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#cusparse Error:#umsg#" << message << " | Error Code: " << result; \
  } while (0);
253 
// Marks `var` as intentionally unused by evaluating it and discarding the result as void.
// Expands to a braced block, so it is valid with or without a following ';'.
#define VARIABLE_NOT_USED(var) \
  { static_cast<void>(var); }
256 
// Returns true only when `input_shape` is non-empty and every dimension is strictly greater
// than zero; an empty shape, or any zero or negative dimension, yields false.
inline bool CheckShapePositive(const std::vector<int64_t> &input_shape) {
  return !input_shape.empty() &&
         std::all_of(input_shape.begin(), input_shape.end(), [](int64_t dim) { return dim > 0; });
}
// Convenience wrapper that calls CheckShapePositive through its fully qualified name.
#define CHECK_SHAPE_POSITIVE(input_shape) mindspore::device::gpu::CheckShapePositive(input_shape)
266 
CurandGetErrorString(curandStatus_t status)267 inline const char *CurandGetErrorString(curandStatus_t status) {
268   switch (status) {
269     case CURAND_STATUS_VERSION_MISMATCH:
270       return "Header file and linked library version do not match.";
271     case CURAND_STATUS_NOT_INITIALIZED:
272       return "Generator not initialized.";
273     case CURAND_STATUS_ALLOCATION_FAILED:
274       return "Memory allocation failed.";
275     case CURAND_STATUS_TYPE_ERROR:
276       return "Generator is wrong type.";
277     case CURAND_STATUS_OUT_OF_RANGE:
278       return "Argument out of range.";
279     case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
280       return "Length requested is not a multiple of dimension.";
281     case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
282       return "GPU does not have double precision required by MRG32k3a.";
283     case CURAND_STATUS_LAUNCH_FAILURE:
284       return "Kernel launch failure.";
285     case CURAND_STATUS_PREEXISTING_FAILURE:
286       return "Preexisting failure on library entry.";
287     case CURAND_STATUS_INITIALIZATION_FAILED:
288       return "Initialization of CUDA failed.";
289     case CURAND_STATUS_ARCH_MISMATCH:
290       return "Architecture mismatch, GPU does not support requested feature.";
291     case CURAND_STATUS_INTERNAL_ERROR:
292       return "Internal library error.";
293     default:
294       return "Unknown the curandStatus.";
295   }
296 }
297 
cuBlasGetErrorString(cublasStatus_t status)298 inline const char *cuBlasGetErrorString(cublasStatus_t status) {
299   switch (status) {
300     case CUBLAS_STATUS_SUCCESS:
301       return "CUBLAS_STATUS_SUCCESS: The operation completed successfully.";
302     case CUBLAS_STATUS_NOT_INITIALIZED:
303       return "CUBLAS_STATUS_NOT_INITIALIZED: The cuBLAS library was not initialized.";
304     case CUBLAS_STATUS_ALLOC_FAILED:
305       return "CUBLAS_STATUS_ALLOC_FAILED: Resource allocation failed inside the cuBLAS library. This is usually caused "
306              "by a cudaMalloc() failure. ";
307     case CUBLAS_STATUS_INVALID_VALUE:
308       return "CUBLAS_STATUS_INVALID_VALUE: An unsupported value or parameter was passed to the function (a negative "
309              "vector size, for example).";
310     case CUBLAS_STATUS_ARCH_MISMATCH:
311       return "CUBLAS_STATUS_ARCH_MISMATCH: The function requires a feature absent from the device architecture; "
312              "usually caused by compute capability lower than 5.0.";
313     case CUBLAS_STATUS_MAPPING_ERROR:
314       return "CUBLAS_STATUS_MAPPING_ERROR: An access to GPU memory space failed, which is usually caused by a failure "
315              "to bind a texture.";
316     case CUBLAS_STATUS_EXECUTION_FAILED:
317       return "CUBLAS_STATUS_EXECUTION_FAILED: The GPU program failed to execute. This is often caused by a launch "
318              "failure of the kernel on the GPU, which can be caused by multiple reasons.";
319     case CUBLAS_STATUS_INTERNAL_ERROR:
320       return "CUBLAS_STATUS_INTERNAL_ERROR: An internal cuBLAS operation failed. This error is usually caused by a "
321              "cudaMemcpyAsync() failure. ";
322     case CUBLAS_STATUS_NOT_SUPPORTED:
323       return "CUBLAS_STATUS_NOT_SUPPORTED: The functionality requested is not supported.";
324     case CUBLAS_STATUS_LICENSE_ERROR:
325       return "CUBLAS_STATUS_LICENSE_ERROR: The functionality requested requires some license and an error was detected "
326              "when trying to check the current licensing. This error can happen if the license is not present or is "
327              "expired or if the environment variable NVIDIA_LICENSE_FILE is not set properly. ";
328     default:
329       return "Unknown cublasStatus.";
330   }
331 }
332 
// Evaluates `expression` once as a curandStatus_t. When it is not CURAND_STATUS_SUCCESS, reports
// through MS_LOG(EXCEPTION): `message`, the numeric status and the text returned by
// mindspore::device::gpu::CurandGetErrorString(status).
#define CHECK_CURAND_RET_WITH_EXCEPT(expression, message) \
  do { \
    curandStatus_t status = (expression); \
    if (status == CURAND_STATUS_SUCCESS) { \
      break; \
    } \
    MS_LOG(EXCEPTION) << "#umsg#CUDA curand Error:#umsg#" << message << " | curandStatus: " << status << " " \
                      << mindspore::device::gpu::CurandGetErrorString(status); \
  } while (0);
341 }  // namespace gpu
342 }  // namespace device
343 }  // namespace mindspore
344 
345 #endif  // MINDSPORE_CCSRC_RUNTIME_DEVICE_GPU_GPU_COMMON_H_
346