1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "plugin/device/gpu/hal/device/gpu_device_manager.h"
18
19 #include <algorithm>
20
21 #include "plugin/device/gpu/hal/device/gpu_common.h"
22 #include "utils/log_adapter.h"
23 #include "include/common/utils/convert_utils.h"
24
25 namespace mindspore {
26 namespace device {
27 namespace gpu {
GetInstance()28 GPUDeviceManager &GPUDeviceManager::GetInstance() {
29 static GPUDeviceManager instance;
30 return instance;
31 }
32
InitDevice()33 void GPUDeviceManager::InitDevice() {
34 CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SetDevice(SizeToInt(cur_dev_id_)), "Failed to set current device id");
35 if (dev_alive_) {
36 return;
37 }
38 CHECK_OP_RET_WITH_EXCEPT(CreateStream(&default_stream_), "Failed to create CUDA stream.");
39 default_stream_id_ = gpu_streams_.size() - 1;
40 CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnCreate(&cudnn_handle_), "Failed to create cuDNN handle");
41 CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnSetStream(cudnn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
42 "Failed to set stream for cuDNN handle.");
43 CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(cublasCreate(&cublas_handle_), "Failed to create cuBLAS handle.");
44 CHECK_CUBLAS_RET_WITH_EXCEPT_NOTRACE(
45 cublasSetStream(cublas_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
46 "Failed to set stream for cuBLAS handle.");
47 CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(cusolverDnCreate(&cusolver_dn_handle_),
48 "Failed to create cusolver dn handle.");
49 CHECK_CUSOLVER_RET_WITH_EXCEPT_NOTRACE(
50 cusolverDnSetStream(cusolver_dn_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
51 "Failed to set stream for cusolver dn handle");
52 // Create cusparse handle.
53 CHECK_CUSPARSE_RET_WITH_EXCEPT(cusparseCreate(&cusparse_handle_), "Failed to create sparse handle.");
54 CHECK_CUSPARSE_RET_WITH_EXCEPT(cusparseSetStream(cusparse_handle_, reinterpret_cast<cudaStream_t>(default_stream())),
55 "Failed to set stream for cusparse handle");
56
57 CHECK_OP_RET_WITH_EXCEPT(GPUMemoryAllocator::GetInstance().Init(), "Failed to Init gpu memory allocator")
58 dev_alive_ = true;
59 }
60
ReleaseDevice()61 void GPUDeviceManager::ReleaseDevice() {
62 // Avoid repeated release device resource.
63 if (!dev_alive_) {
64 return;
65 }
66 {
67 std::lock_guard<std::mutex> lock_gpu_streams(stream_mutex_);
68 for (CudaDeviceStream stream : gpu_streams_) {
69 if (stream != nullptr) {
70 CHECK_OP_RET_WITH_ERROR(CudaDriver::DestroyStream(stream), "Failed to destroy CUDA stream.");
71 }
72 }
73 gpu_streams_.clear();
74 }
75
76 if (cudnn_handle_ != nullptr) {
77 CHECK_CUDNN_RET_WITH_ERROR_NOTRACE(cudnnDestroy(cudnn_handle_), "Failed to destroy cuDNN handle");
78 }
79 if (cublas_handle_ != nullptr) {
80 CHECK_CUBLAS_RET_WITH_ERROR(cublasDestroy(cublas_handle_), "Failed to destroy cuBLAS handle.");
81 }
82 if (cusolver_dn_handle_ != nullptr) {
83 CHECK_CUSOLVER_RET_WITH_ERROR(cusolverDnDestroy(cusolver_dn_handle_), "Failed to destroy cusolver dn handle.");
84 }
85 if (cusparse_handle_ != nullptr) {
86 CHECK_CUSPARSE_RET_WITH_ERROR(cusparseDestroy(cusparse_handle_), "Failed to destroy cusparse handle.");
87 }
88
89 dev_alive_ = false;
90 }
91
CreateStream(CudaDeviceStream * stream)92 bool GPUDeviceManager::CreateStream(CudaDeviceStream *stream) {
93 std::lock_guard<std::mutex> lock_gpu_streams(stream_mutex_);
94 CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateStream(stream), "Failed to create CUDA stream");
95 (void)gpu_streams_.emplace_back(*stream);
96 return true;
97 }
98
CreateStream(size_t * stream_id)99 bool GPUDeviceManager::CreateStream(size_t *stream_id) {
100 MS_EXCEPTION_IF_NULL(stream_id);
101
102 std::lock_guard<std::mutex> lock_gpu_streams(stream_mutex_);
103 CudaDeviceStream stream;
104 CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateStream(&stream), "Failed to create CUDA stream");
105 *stream_id = gpu_streams_.size();
106 (void)gpu_streams_.emplace_back(stream);
107 return true;
108 }
109
CreateStreamWithPriority(size_t * stream_id,int32_t priority)110 bool GPUDeviceManager::CreateStreamWithPriority(size_t *stream_id, int32_t priority) {
111 MS_EXCEPTION_IF_NULL(stream_id);
112
113 std::lock_guard<std::mutex> lock_gpu_streams(stream_mutex_);
114 CudaDeviceStream stream;
115 CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateStreamWithPriority(&stream, priority),
116 "Failed to create CUDA stream with priority");
117 *stream_id = gpu_streams_.size();
118 (void)gpu_streams_.emplace_back(stream);
119
120 return true;
121 }
122
DestroyStream(size_t stream_id)123 bool GPUDeviceManager::DestroyStream(size_t stream_id) {
124 std::lock_guard<std::mutex> lock_gpu_streams(stream_mutex_);
125 if (stream_id >= gpu_streams_.size()) {
126 MS_LOG(ERROR) << "CUDA stream not found for stream id " << stream_id;
127 return false;
128 }
129 if (gpu_streams_.at(stream_id) == nullptr) {
130 MS_LOG(WARNING) << "CUDA stream hsa been destroyed for stream id " << stream_id;
131 return true;
132 }
133 CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyStream(gpu_streams_.at(stream_id)), "Failed to destroy CUDA stream");
134 gpu_streams_[stream_id] = nullptr;
135 return true;
136 }
137
GetStream(size_t stream_id) const138 CudaDeviceStream GPUDeviceManager::GetStream(size_t stream_id) const {
139 if (stream_id >= gpu_streams_.size()) {
140 MS_LOG(DEBUG) << "Stream for stream id[" << stream_id << "] not found, return nullptr.";
141 return nullptr;
142 }
143 return gpu_streams_[stream_id];
144 }
145
QueryStreamSize() const146 size_t GPUDeviceManager::QueryStreamSize() const {
147 return std::count_if(gpu_streams_.begin(), gpu_streams_.end(),
148 [](CudaDeviceStream stream) { return stream != nullptr; });
149 }
150
GetStreamIds() const151 std::vector<uint32_t> GPUDeviceManager::GetStreamIds() const {
152 std::vector<uint32_t> stream_ids;
153 for (size_t i = 0; i < gpu_streams_.size(); i++) {
154 if (gpu_streams_[i] != nullptr) {
155 (void)stream_ids.emplace_back(static_cast<uint32_t>(i));
156 }
157 }
158 return stream_ids;
159 }
160
set_current_stream(size_t stream_id)161 void GPUDeviceManager::set_current_stream(size_t stream_id) { current_stream_id_ = stream_id; }
162
current_stream() const163 size_t GPUDeviceManager::current_stream() const { return current_stream_id_; }
164
QueryStream(size_t stream_id)165 bool GPUDeviceManager::QueryStream(size_t stream_id) {
166 if (stream_id >= gpu_streams_.size()) {
167 MS_LOG(ERROR) << "CUDA stream not found for stream id " << stream_id;
168 return false;
169 }
170 if (gpu_streams_.at(stream_id) == nullptr) {
171 MS_LOG(WARNING) << "CUDA stream has been destroyed for stream id " << stream_id;
172 return true;
173 }
174 MS_LOG(DEBUG) << "Query completion status of stream id: " << stream_id;
175 return CudaDriver::QueryStream(gpu_streams_.at(stream_id));
176 }
177
default_stream() const178 const CudaDeviceStream &GPUDeviceManager::default_stream() const { return default_stream_; }
179
default_stream_id() const180 size_t GPUDeviceManager::default_stream_id() const { return default_stream_id_; }
181
device_count() const182 int GPUDeviceManager::device_count() const { return CudaDriver::device_count(); }
183
set_cur_device_id(uint32_t device_id)184 bool GPUDeviceManager::set_cur_device_id(uint32_t device_id) {
185 if (!dev_id_init_) {
186 dev_id_init_ = true;
187 cur_dev_id_ = device_id;
188 return true;
189 } else {
190 MS_LOG(ERROR) << "Device already been set.";
191 return false;
192 }
193 }
194
cur_device_id() const195 uint32_t GPUDeviceManager::cur_device_id() const { return cur_dev_id_; }
196
is_device_id_init() const197 bool GPUDeviceManager::is_device_id_init() const { return dev_id_init_; }
198
GetCudnnHandle() const199 const cudnnHandle_t &GPUDeviceManager::GetCudnnHandle() const { return cudnn_handle_; }
200
GetCublasHandle() const201 const cublasHandle_t &GPUDeviceManager::GetCublasHandle() const { return cublas_handle_; }
202
GetCusolverDnHandle() const203 const cusolverDnHandle_t &GPUDeviceManager::GetCusolverDnHandle() const { return cusolver_dn_handle_; }
204
GetCuSparseHandle() const205 const cusparseHandle_t &GPUDeviceManager::GetCuSparseHandle() const { return cusparse_handle_; }
206
SyncStream(size_t stream_id) const207 bool GPUDeviceManager::SyncStream(size_t stream_id) const {
208 if (!dev_alive_) {
209 return false;
210 }
211 auto stream = GetStream(stream_id);
212 if (stream == nullptr) {
213 MS_LOG(EXCEPTION) << "Get CUDA stream for stream id failed.";
214 }
215 return SyncStream(stream);
216 }
217
SyncStream(const CudaDeviceStream & stream) const218 bool GPUDeviceManager::SyncStream(const CudaDeviceStream &stream) const {
219 return dev_alive_ && CudaDriver::SyncStream(stream);
220 }
221
SyncAllStreams() const222 bool GPUDeviceManager::SyncAllStreams() const {
223 if (!dev_alive_) {
224 return false;
225 }
226 for (const auto &stream : gpu_streams_) {
227 if (stream != nullptr && !SyncStream(stream)) {
228 return false;
229 }
230 }
231 return true;
232 }
233
SyncNotDefaultStreams() const234 bool GPUDeviceManager::SyncNotDefaultStreams() const {
235 bool res = true;
236 for (size_t i = 0; i < gpu_streams_.size(); i++) {
237 if (i != default_stream_id_ && !SyncStream(i)) {
238 MS_LOG(ERROR) << "Failed to sync for gpu stream id: " << i;
239 res = false;
240 }
241 }
242 return res;
243 }
244
SyncExceptStreamsInList(const std::set<CudaDeviceStream> & except_streams) const245 bool GPUDeviceManager::SyncExceptStreamsInList(const std::set<CudaDeviceStream> &except_streams) const {
246 bool res = true;
247 for (size_t i = 0; i < gpu_streams_.size(); i++) {
248 if (except_streams.count(gpu_streams_[i]) > 0) {
249 MS_LOG(DEBUG) << "Stream id:" << i << " is been synchronized.";
250 continue;
251 }
252 if (!SyncStream(i)) {
253 MS_LOG(ERROR) << "Failed to sync for gpu stream id: " << i;
254 res = false;
255 }
256 }
257 return res;
258 }
259
CopyDeviceMemToHost(const HostMemPtr & dst,const DeviceMemPtr & src,size_t size) const260 bool GPUDeviceManager::CopyDeviceMemToHost(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size) const {
261 return CudaDriver::CopyDeviceMemToHost(dst, src, size);
262 }
263
CopyHostMemToDevice(const DeviceMemPtr & dst,const void * src,size_t size) const264 bool GPUDeviceManager::CopyHostMemToDevice(const DeviceMemPtr &dst, const void *src, size_t size) const {
265 return CudaDriver::CopyHostMemToDevice(dst, src, size);
266 }
267
CopyHostMemToHost(const HostMemPtr & dst,const void * src,size_t size) const268 bool GPUDeviceManager::CopyHostMemToHost(const HostMemPtr &dst, const void *src, size_t size) const {
269 return CudaDriver::CopyHostMemToHost(dst, src, size);
270 }
271
CopyDeviceMemToHostAsync(const HostMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream) const272 bool GPUDeviceManager::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const void *src, size_t size,
273 CudaDeviceStream stream) const {
274 return CudaDriver::CopyDeviceMemToHostAsync(dst, src, size, stream);
275 }
276
CopyHostMemToDeviceAsync(const DeviceMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream) const277 bool GPUDeviceManager::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
278 CudaDeviceStream stream) const {
279 return CudaDriver::CopyHostMemToDeviceAsync(dst, src, size, stream);
280 }
281
CopyDeviceMemToDeviceAsync(const DeviceMemPtr & dst,const void * src,size_t size,CudaDeviceStream stream) const282 bool GPUDeviceManager::CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size,
283 CudaDeviceStream stream) const {
284 return CudaDriver::CopyDeviceMemToDeviceAsync(dst, src, size, stream);
285 }
286 } // namespace gpu
287 } // namespace device
288 } // namespace mindspore
289