/**
 * Copyright 2019-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/gpu/hal/device/cuda_driver.h"
#include <nvrtc.h>
#include "utils/log_adapter.h"
#include "include/common/utils/convert_utils.h"

namespace mindspore {
namespace device {
namespace gpu {
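// Allocates device memory with cudaMalloc. On cudaErrorMemoryAllocation the requested size is scaled down by
// mem_malloc_retry_rate_, aligned down to mem_malloc_align_size_, and retried up to mem_malloc_retry_conut_max_
// times. Returns the size actually allocated (which may be smaller than requested), or 0 on failure.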
size_t CudaDriver::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
  if (size == 0) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaMalloc alloc size is 0.";
  }
  size_t retreat_count = 0;
  auto ret = cudaMalloc(reinterpret_cast<void **>(addr), size);
  // If free memory is not enough, retry with a size scaled down by mem_malloc_retry_rate_.
  while (ret == cudaErrorMemoryAllocation) {
    size = FloatToSize(size * mem_malloc_retry_rate_);
    size = (size / mem_malloc_align_size_) * mem_malloc_align_size_;
    ret = cudaMalloc(reinterpret_cast<void **>(addr), size);
    retreat_count++;
    if (retreat_count > mem_malloc_retry_conut_max_) {
      break;
    }
  }

  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMalloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return 0;
  }
  return size;
}

bool CudaDriver::FreeDeviceMem(const DeviceMemPtr &addr) {
  if (addr == nullptr) {
    return true;
  }
  auto ret = cudaFree(addr);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaFree failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

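// Allocates page-locked (pinned) host memory with cudaHostAlloc, which enables faster and asynchronous transfers
// between host and device. Returns the allocated size, or 0 on failure.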
size_t CudaDriver::AllocHostPinnedMem(size_t size, void **addr) {
  if (size == 0) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaHostAlloc alloc size is 0.";
  }
  auto ret = cudaHostAlloc(addr, size, cudaHostAllocDefault);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return 0;
  }
  return size;
}

void CudaDriver::FreeHostPinnedMem(void *addr) {
  if (addr) {
    auto ret = cudaFreeHost(addr);
    if (ret != cudaSuccess) {
      MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaFreeHost failed, ret[" << static_cast<int>(ret) << "], "
                        << cudaGetErrorString(ret);
    }
  }
}

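// Page-locks an already allocated host buffer with cudaHostRegister so it can be used like pinned memory for
// device transfers. A failure is only logged at INFO level.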
void CudaDriver::CudaHostRegister(void *addr, size_t alloc_size) {
  MS_EXCEPTION_IF_NULL(addr);
  auto ret = cudaHostRegister(addr, alloc_size, cudaHostRegisterDefault);
  if (ret != cudaSuccess) {
    MS_LOG(INFO) << "cudaHostRegister failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
  }
}

void CudaDriver::CudaHostUnregister(void *addr) {
  MS_EXCEPTION_IF_NULL(addr);
  auto ret = cudaHostUnregister(addr);
  if (ret != cudaSuccess) {
    MS_LOG(INFO) << "cudaHostUnregister failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
  }
}

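// Synchronous copy helpers: thin wrappers over cudaMemcpy that log an error and return false on failure.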
bool CudaDriver::CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size) {
  auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size) {
  auto ret = cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::CopyHostMemToHost(const HostMemPtr &dst, const void *src, size_t size) {
  auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToHost);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

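// Asynchronous copy helpers: the copies are enqueued on the given stream with cudaMemcpyAsync and are only
// guaranteed to be complete after the stream has been synchronized.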
bool CudaDriver::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
                                          CudaDeviceStream stream) {
  auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const void *src, size_t size,
                                          CudaDeviceStream stream) {
  auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, (cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
                                            CudaDeviceStream stream) {
  auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

size_t CudaDriver::total_mem_size() {
  size_t free;
  size_t total;
  auto ret = cudaMemGetInfo(&free, &total);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemGetInfo failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return 0;
  }
  return total;
}

size_t CudaDriver::free_mem_size() {
  size_t free;
  size_t total;
  auto ret = cudaMemGetInfo(&free, &total);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaMemGetInfo failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return 0;
  }
  return free;
}

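// Streams are created with the cudaStreamNonBlocking flag, so work submitted to them does not implicitly
// synchronize with the default (NULL) stream.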
bool CudaDriver::CreateStream(CudaDeviceStream *stream) {
  auto ret = cudaStreamCreateWithFlags(reinterpret_cast<CUstream_st **>(stream), cudaStreamNonBlocking);
  if (ret != cudaSuccess) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaStreamCreateWithFlags failed, ret[" << static_cast<int>(ret)
                      << "], " << cudaGetErrorString(ret);
  }
  return true;
}

bool CudaDriver::CreateStreamWithPriority(CudaDeviceStream *stream, int priority) {
  auto ret = cudaStreamCreateWithPriority(reinterpret_cast<CUstream_st **>(stream), cudaStreamNonBlocking, priority);
  if (ret != cudaSuccess) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaStreamCreateWithPriority failed, ret[" << static_cast<int>(ret)
                      << "], " << cudaGetErrorString(ret);
  }
  return true;
}

bool CudaDriver::DestroyStream(const CudaDeviceStream &stream) {
  auto ret = cudaStreamDestroy((cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaStreamDestroy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

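// Blocks until all work previously submitted to the stream has completed. Because CUDA reports many errors
// asynchronously, a failure observed here may originate from an earlier kernel launch.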
bool CudaDriver::SyncStream(const CudaDeviceStream &stream) {
  auto ret = cudaStreamSynchronize((cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaStreamSynchronize failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    if (ret != cudaErrorNotReady && common::GetEnv("CUDA_LAUNCH_BLOCKING") != "1") {
      MS_LOG(ERROR) << "The kernel name and backtrace in the log might be incorrect, since the CUDA error might be "
                    << "reported asynchronously at some other function call. Please export CUDA_LAUNCH_BLOCKING=1 "
                    << "for more accurate error positioning.";
    }
    return false;
  }
  return true;
}

bool CudaDriver::QueryStream(const CudaDeviceStream &stream) {
  auto ret = cudaStreamQuery((cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(DEBUG) << "Tasks on stream " << stream << " are not completed yet.";
    return false;
  }
  MS_LOG(DEBUG) << "Tasks on stream " << stream << " are completed.";
  return true;
}

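// Event helpers: wrappers over CUDA event creation, destruction, recording, synchronization, querying and timing.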
bool CudaDriver::ConstructEvent(CudaDeviceEvent *event, unsigned int flag) {
  auto ret = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(event), flag);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaEventCreateWithFlags failed, ret[" << static_cast<int>(ret) << "], "
                  << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::DestroyEvent(const CudaDeviceEvent &event) {
  auto ret = cudaEventDestroy((cudaEvent_t)event);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaEventDestroy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::RecordEvent(CudaDeviceEvent event, CudaDeviceStream stream) {
  auto ret = cudaEventRecord((cudaEvent_t)event, (cudaStream_t)stream);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaEventRecord failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

bool CudaDriver::SyncEvent(const CudaDeviceEvent &event) {
  auto ret = cudaEventSynchronize((cudaEvent_t)event);
  if (ret != cudaSuccess) {
    MS_LOG(ERROR) << "cudaEventSynchronize failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
  return true;
}

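// Returns true if all work captured by the event has completed, false if it is not ready yet or the query fails.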
bool CudaDriver::QueryEvent(const CudaDeviceEvent &event) {
  auto ret = cudaEventQuery((cudaEvent_t)event);
  if (ret == cudaSuccess) {
    return true;
  } else if (ret == cudaErrorNotReady) {
    return false;
  } else {
    MS_LOG(ERROR) << "cudaEventQuery failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
}

bool CudaDriver::ElapsedTime(float *cost_time, const CudaDeviceEvent &start, const CudaDeviceEvent &end) {
  auto ret = cudaEventElapsedTime(cost_time, (cudaEvent_t)start, (cudaEvent_t)end);
  if (ret == cudaSuccess) {
    return true;
  } else {
    MS_LOG(ERROR) << "cudaEventElapsedTime failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
    return false;
  }
}

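// Checks for (and raises on) any pending CUDA error before querying the number of visible CUDA devices.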
int CudaDriver::device_count() {
  auto last_error = cudaGetLastError();
  if (last_error != cudaSuccess) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaGetLastError[" << static_cast<int>(last_error) << "], "
                      << cudaGetErrorString(last_error);
  }

  int dev_count = 0;
  auto ret = cudaGetDeviceCount(&dev_count);
  if (ret != cudaSuccess) {
    MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaGetDeviceCount failed, ret[" << static_cast<int>(ret) << "], "
                      << cudaGetErrorString(ret);
  }
  return dev_count;
}

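// Binds the calling thread to the CUDA device with the given index and logs the NVRTC version at DEBUG level.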
bool CudaDriver::SetDevice(int index) {
  auto ret = cudaSetDevice(index);
  if (ret != cudaSuccess) {
    MS_LOG(EXCEPTION)
      << "#umsg#Cuda error:#umsg#SetDevice for id:" << index << " failed, ret[" << static_cast<int>(ret) << "], "
      << cudaGetErrorString(ret)
      << ". Please make sure that the 'device_id' set in the context is in the range [0, total number of GPUs). "
         "If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPUs is the number of "
         "devices listed in 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, then "
         "'device_id' can be 0, 1 or 2; 'device_id' starts from 0, and 'device_id'=0 means using GPU number 4.";
  }
  int major = 0;
  int minor = 0;
  auto nvrtc_ret = nvrtcVersion(&major, &minor);
  if (nvrtc_ret == nvrtcResult::NVRTC_SUCCESS) {
    MS_LOG(DEBUG) << "NVRTC version is " << major << "." << minor;
  }
  return true;
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore