• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/gpu/hal/device/cuda_driver.h"
18 #include <nvrtc.h>
19 #include "utils/log_adapter.h"
20 #include "include/common/utils/convert_utils.h"
21 
22 namespace mindspore {
23 namespace device {
24 namespace gpu {
AllocDeviceMem(size_t size,DeviceMemPtr * addr)25 size_t CudaDriver::AllocDeviceMem(size_t size, DeviceMemPtr *addr) {
26   if (size <= 0) {
27     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaMalloc alloc size is under 0.";
28   }
29   size_t retreat_count = 0;
30   auto ret = cudaMalloc(reinterpret_cast<void **>(addr), size);
31   // If free memory is not enough, then retry with mem_malloc_retry_rate_.
32   while (ret == cudaErrorMemoryAllocation) {
33     size = FloatToSize(size * mem_malloc_retry_rate_);
34     size = (size / mem_malloc_align_size_) * mem_malloc_align_size_;
35     ret = cudaMalloc(reinterpret_cast<void **>(addr), size);
36     retreat_count++;
37     if (retreat_count > mem_malloc_retry_conut_max_) {
38       break;
39     }
40   }
41 
42   if (ret != cudaSuccess) {
43     MS_LOG(ERROR) << "cudaMalloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
44     return 0;
45   }
46   return size;
47 }
48 
FreeDeviceMem(const DeviceMemPtr & addr)49 bool CudaDriver::FreeDeviceMem(const DeviceMemPtr &addr) {
50   if (addr == nullptr) {
51     return true;
52   }
53   auto ret = cudaFree(addr);
54   if (ret != cudaSuccess) {
55     MS_LOG(ERROR) << "cudaFree failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
56     return false;
57   }
58   return true;
59 }
60 
AllocHostPinnedMem(size_t size,void ** addr)61 size_t CudaDriver::AllocHostPinnedMem(size_t size, void **addr) {
62   if (size == 0) {
63     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaHostAlloc allocate size is 0";
64   }
65   auto ret = cudaHostAlloc(addr, size, cudaHostAllocDefault);
66   if (ret != cudaSuccess) {
67     MS_LOG(ERROR) << "cudaHostAlloc failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
68     return 0;
69   }
70   return size;
71 }
72 
FreeHostPinnedMem(void * addr)73 void CudaDriver::FreeHostPinnedMem(void *addr) {
74   if (addr) {
75     auto ret = cudaFreeHost(addr);
76     if (ret != cudaSuccess) {
77       MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaFreeHost failed, ret[" << static_cast<int>(ret) << "], "
78                         << cudaGetErrorString(ret);
79     }
80   }
81 }
82 
CudaHostRegister(void * addr,size_t alloc_size)83 void CudaDriver::CudaHostRegister(void *addr, size_t alloc_size) {
84   MS_EXCEPTION_IF_NULL(addr);
85   auto ret = cudaHostRegister(addr, alloc_size, cudaHostRegisterDefault);
86   if (ret != cudaSuccess) {
87     MS_LOG(INFO) << "cudaHostRegister failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
88   }
89 }
90 
CudaHostUnregister(void * addr)91 void CudaDriver::CudaHostUnregister(void *addr) {
92   MS_EXCEPTION_IF_NULL(addr);
93   auto ret = cudaHostUnregister(addr);
94   if (ret != cudaSuccess) {
95     MS_LOG(INFO) << "cudaHostUnregister failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
96   }
97 }
98 
CopyHostMemToDevice(const DeviceMemPtr & dst,const void * src,size_t size)99 bool CudaDriver::CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size) {
100   auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
101   if (ret != cudaSuccess) {
102     MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
103     return false;
104   }
105   return true;
106 }
107 
CopyDeviceMemToHost(const HostMemPtr & dst,const DeviceMemPtr & src,size_t size)108 bool CudaDriver::CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size) {
109   auto ret = cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
110   if (ret != cudaSuccess) {
111     MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
112     return false;
113   }
114   return true;
115 }
116 
CopyHostMemToHost(const HostMemPtr & dst,const void * src,size_t size)117 bool CudaDriver::CopyHostMemToHost(const HostMemPtr &dst, const void *src, size_t size) {
118   auto ret = cudaMemcpy(dst, src, size, cudaMemcpyHostToHost);
119   if (ret != cudaSuccess) {
120     MS_LOG(ERROR) << "cudaMemcpy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
121     return false;
122   }
123   return true;
124 }
125 
CopyHostMemToDeviceAsync(const DeviceMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream)126 bool CudaDriver::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
127                                           CudaDeviceStream stream) {
128   auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, (cudaStream_t)stream);
129   if (ret != cudaSuccess) {
130     MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
131     return false;
132   }
133   return true;
134 }
135 
CopyDeviceMemToHostAsync(const HostMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream)136 bool CudaDriver::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const void *src, size_t size,
137                                           CudaDeviceStream stream) {
138   auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, (cudaStream_t)stream);
139   if (ret != cudaSuccess) {
140     MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
141     return false;
142   }
143   return true;
144 }
145 
CopyDeviceMemToDeviceAsync(const DeviceMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream)146 bool CudaDriver::CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
147                                             CudaDeviceStream stream) {
148   auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
149   if (ret != cudaSuccess) {
150     MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
151     return false;
152   }
153   return true;
154 }
155 
total_mem_size()156 size_t CudaDriver::total_mem_size() {
157   size_t free;
158   size_t total;
159   auto ret = cudaMemGetInfo(&free, &total);
160   if (ret != cudaSuccess) {
161     MS_LOG(ERROR) << "cudaMemGetInfo failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
162     return 0;
163   }
164   return total;
165 }
166 
free_mem_size()167 size_t CudaDriver::free_mem_size() {
168   size_t free;
169   size_t total;
170   auto ret = cudaMemGetInfo(&free, &total);
171   if (ret != cudaSuccess) {
172     MS_LOG(ERROR) << "cudaMemGetInfo failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
173     return 0;
174   }
175 
176   return free;
177 }
178 
CreateStream(CudaDeviceStream * stream)179 bool CudaDriver::CreateStream(CudaDeviceStream *stream) {
180   auto ret = cudaStreamCreateWithFlags(reinterpret_cast<CUstream_st **>(stream), cudaStreamNonBlocking);
181   if (ret != cudaSuccess) {
182     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaStreamCreateWithFlags failed, ret[" << static_cast<int>(ret)
183                       << "], " << cudaGetErrorString(ret);
184   }
185   return true;
186 }
187 
CreateStreamWithPriority(CudaDeviceStream * stream,int priority)188 bool CudaDriver::CreateStreamWithPriority(CudaDeviceStream *stream, int priority) {
189   auto ret = cudaStreamCreateWithPriority(reinterpret_cast<CUstream_st **>(stream), cudaStreamNonBlocking, priority);
190   if (ret != cudaSuccess) {
191     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaStreamCreateWithPriority failed, ret[" << static_cast<int>(ret)
192                       << "], " << cudaGetErrorString(ret);
193   }
194   return true;
195 }
196 
DestroyStream(const CudaDeviceStream & stream)197 bool CudaDriver::DestroyStream(const CudaDeviceStream &stream) {
198   auto ret = cudaStreamDestroy((cudaStream_t)stream);
199   if (ret != cudaSuccess) {
200     MS_LOG(ERROR) << "cudaStreamDestroy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
201     return false;
202   }
203   return true;
204 }
205 
SyncStream(const CudaDeviceStream & stream)206 bool CudaDriver::SyncStream(const CudaDeviceStream &stream) {
207   auto ret = cudaStreamSynchronize((cudaStream_t)stream);
208   if (ret != cudaSuccess) {
209     MS_LOG(ERROR) << "cudaStreamSynchronize failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
210     if (ret != cudaErrorNotReady && common::GetEnv("CUDA_LAUNCH_BLOCKING") != "1") {
211       MS_LOG(ERROR) << "The kernel name and backtrace in log might be incorrect, since CUDA error might be "
212                     << "asynchronously reported at some other function call. Please exporting CUDA_LAUNCH_BLOCKING=1 "
213                     << "for more accurate error positioning.";
214     }
215     return false;
216   }
217   return true;
218 }
219 
QueryStream(const CudaDeviceStream & stream)220 bool CudaDriver::QueryStream(const CudaDeviceStream &stream) {
221   auto ret = cudaStreamQuery((cudaStream_t)stream);
222   if (ret != cudaSuccess) {
223     MS_LOG(DEBUG) << "Tasks on stream " << stream << " are not completed yet.";
224     return false;
225   }
226   MS_LOG(DEBUG) << "Tasks on stream " << stream << " are completed.";
227   return true;
228 }
229 
ConstructEvent(CudaDeviceEvent * event,unsigned int flag)230 bool CudaDriver::ConstructEvent(CudaDeviceEvent *event, unsigned int flag) {
231   auto ret = cudaEventCreateWithFlags(reinterpret_cast<cudaEvent_t *>(event), flag);
232   if (ret != cudaSuccess) {
233     MS_LOG(ERROR) << "cudaEventCreateWithFlags failed, ret[" << static_cast<int>(ret) << "], "
234                   << cudaGetErrorString(ret);
235     return false;
236   }
237   return true;
238 }
239 
DestroyEvent(const CudaDeviceEvent & event)240 bool CudaDriver::DestroyEvent(const CudaDeviceEvent &event) {
241   auto ret = cudaEventDestroy((cudaEvent_t)event);
242   if (ret != cudaSuccess) {
243     MS_LOG(ERROR) << "cudaEventDestroy failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
244     return false;
245   }
246   return true;
247 }
248 
RecordEvent(CudaDeviceEvent event,CudaDeviceStream stream)249 bool CudaDriver::RecordEvent(CudaDeviceEvent event, CudaDeviceStream stream) {
250   auto ret = cudaEventRecord((cudaEvent_t)event, (cudaStream_t)stream);
251   if (ret != cudaSuccess) {
252     MS_LOG(ERROR) << "cudaEventRecord failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
253     return false;
254   }
255   return true;
256 }
257 
SyncEvent(const CudaDeviceEvent & event)258 bool CudaDriver::SyncEvent(const CudaDeviceEvent &event) {
259   auto ret = cudaEventSynchronize((cudaEvent_t)event);
260   if (ret != cudaSuccess) {
261     MS_LOG(ERROR) << "cudaEventSynchronize failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
262     return false;
263   }
264   return true;
265 }
266 
QueryEvent(const CudaDeviceEvent & event)267 bool CudaDriver::QueryEvent(const CudaDeviceEvent &event) {
268   auto ret = cudaEventQuery((cudaEvent_t)event);
269   if (ret == cudaSuccess) {
270     return true;
271   } else if (ret == cudaErrorNotReady) {
272     return false;
273   } else {
274     MS_LOG(ERROR) << "cudaEventQuery failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
275     return false;
276   }
277 }
278 
ElapsedTime(float * cost_time,const CudaDeviceEvent & start,const CudaDeviceEvent & end)279 bool CudaDriver::ElapsedTime(float *cost_time, const CudaDeviceEvent &start, const CudaDeviceEvent &end) {
280   auto ret = cudaEventElapsedTime(cost_time, (cudaEvent_t)start, (cudaEvent_t)end);
281   if (ret == cudaSuccess) {
282     return true;
283   } else {
284     MS_LOG(ERROR) << "cudaEventElapsedTime failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
285     return false;
286   }
287 }
288 
device_count()289 int CudaDriver::device_count() {
290   auto last_error = cudaGetLastError();
291   if (last_error != cudaSuccess) {
292     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaGetLastError[" << static_cast<int>(last_error) << "], "
293                       << cudaGetErrorString(last_error);
294   }
295 
296   int dev_count = 0;
297   auto ret = cudaGetDeviceCount(&dev_count);
298   if (ret != cudaSuccess) {
299     MS_LOG(EXCEPTION) << "#umsg#Cuda error:#umsg#The cudaGetDeviceCount failed, ret[" << static_cast<int>(ret) << "], "
300                       << cudaGetErrorString(ret);
301   }
302   return dev_count;
303 }
304 
SetDevice(int index)305 bool CudaDriver::SetDevice(int index) {
306   auto ret = cudaSetDevice(index);
307   if (ret != cudaSuccess) {
308     MS_LOG(EXCEPTION)
309       << "#umsg#Cuda error:#umsg#SetDevice for id:" << index << " failed, ret[" << static_cast<int>(ret) << "], "
310       << cudaGetErrorString(ret)
311       << ". Please make sure that the 'device_id' set in context is in the range:[0, total number of GPU). "
312          "If the environment variable 'CUDA_VISIBLE_DEVICES' is set, the total number of GPU will be the number set "
313          "in the environment variable 'CUDA_VISIBLE_DEVICES'. For example, if export CUDA_VISIBLE_DEVICES=4,5,6, the "
314          "'device_id' can be 0,1,2 at the moment, 'device_id' starts from 0, and 'device_id'=0 means using GPU of "
315          "number 4.";
316   }
317   int major = 0;
318   int minor = 0;
319   auto curtc_ret = nvrtcVersion(&major, &minor);
320   if (curtc_ret == nvrtcResult::NVRTC_SUCCESS) {
321     MS_LOG(DEBUG) << "NVRTC version is " << major << "." << minor;
322   }
323   return true;
324 }
325 }  // namespace gpu
326 }  // namespace device
327 }  // namespace mindspore
328