1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/gpu/opencl/opencl_runtime.h"
18 #include <dlfcn.h>
19 #ifdef SHARING_MEM_WITH_OPENGL
20 #include <EGL/egl.h>
21 #endif
22 #include <vector>
23 #include <numeric>
24 #include <utility>
25 #include "include/errorcode.h"
26 #include "src/runtime/kernel/opencl/utils.h"
27 #include "src/runtime/gpu/opencl/opencl_allocator.h"
28 #include "src/common/file_utils.h"
29
30 using mindspore::kernel::CLErrorCode;
31
32 namespace mindspore::lite::opencl {
33 static std::map<std::string, std::string> g_source_map;
34 static std::mutex g_mtx;
35 static std::mutex g_init_mtx;
36
37 InitState OpenCLRuntime::init_state_ = UnInit;
38 OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr;
39 size_t OpenCLRuntime::instance_count_ = 0;
40
GetInstance()41 OpenCLRuntime *OpenCLRuntime::GetInstance() {
42 std::unique_lock<std::mutex> lck(g_mtx);
43 static OpenCLRuntime ocl_runtime;
44 if (instance_count_ == 0) {
45 ocl_runtime_instance_ = &ocl_runtime;
46 ocl_runtime_instance_->Init();
47 }
48 instance_count_++;
49 return ocl_runtime_instance_;
50 }
51
DeleteInstance()52 void OpenCLRuntime::DeleteInstance() {
53 std::unique_lock<std::mutex> lck(g_mtx);
54 if (instance_count_ == 0) {
55 MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!";
56 return;
57 }
58 instance_count_--;
59 if (instance_count_ == 0) {
60 ocl_runtime_instance_->Uninit();
61 }
62 }
63
printf_callback(const char * buffer,size_t length,size_t final,void * user_data)64 void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
65 fwrite(buffer, 1, length, stdout);
66 }
67
InitGPUDevice(std::vector<cl::Platform> * platforms)68 int OpenCLRuntime::InitGPUDevice(std::vector<cl::Platform> *platforms) {
69 MS_ASSERT(platforms);
70 // search GPU
71 std::vector<cl::Device> devices;
72 int ret = RET_OK;
73 for (auto &platform : *platforms) {
74 std::string platform_name;
75 ret = platform.getInfo(CL_PLATFORM_NAME, &platform_name);
76 if (ret != CL_SUCCESS) {
77 MS_LOG(WARNING) << CLErrorCode(ret);
78 }
79 ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
80 if (ret != CL_SUCCESS) {
81 MS_LOG(WARNING) << CLErrorCode(ret);
82 }
83 MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";
84
85 if (!devices.empty()) {
86 std::string device_name = devices[0].getInfo<CL_DEVICE_NAME>();
87 MS_LOG(INFO) << "Find GPU: " << device_name.c_str();
88 cl::Platform::setDefault(platform);
89 break;
90 }
91 }
92
93 // not found, return error code.
94 if (devices.empty()) {
95 MS_LOG(ERROR) << "OpenCL Device not found!";
96 return RET_ERROR;
97 }
98
99 device_ = new (std::nothrow) cl::Device();
100 if (device_ == nullptr) {
101 MS_LOG(ERROR) << "Create OpenCL device failed!";
102 return RET_ERROR;
103 }
104 *device_ = devices[0];
105 max_work_item_sizes_ = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
106 max_work_group_size_ = max_work_item_sizes_[0];
107 const std::string device_name = device_->getInfo<CL_DEVICE_NAME>();
108 const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
109 const std::string opencl_version = device_->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
110 clGetDeviceInfo((*device_)(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &image_pitch_align_, nullptr);
111 MS_LOG(INFO) << "Device name:\t" << device_name;
112 MS_LOG(INFO) << "Opencl version:\t" << device_version;
113 MS_LOG(INFO) << "Image pitch alignment:\t" << image_pitch_align_;
114 MS_LOG(INFO) << "Highest OpenCL c version:\t" << opencl_version;
115 MS_LOG(INFO) << "Max work item size:\t" << max_work_item_sizes_[0] << " : " << max_work_item_sizes_[1] << " : "
116 << max_work_item_sizes_[2];
117
118 gpu_info_ = ParseGpuInfo(device_name, device_version);
119 // get cache size, compute units and frequency.
120 ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
121 if (ret != CL_SUCCESS) {
122 MS_LOG(WARNING) << CLErrorCode(ret);
123 }
124 ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
125 if (ret != CL_SUCCESS) {
126 MS_LOG(WARNING) << CLErrorCode(ret);
127 }
128 ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
129 if (ret != CL_SUCCESS) {
130 MS_LOG(WARNING) << CLErrorCode(ret);
131 }
132 cl_device_fp_config fp_config;
133 auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
134 support_fp16_ = CL_SUCCESS == success && fp_config > 0;
135
136 ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
137 if (ret != CL_SUCCESS || svm_capabilities_ == 0) {
138 svm_capabilities_ = 0;
139 MS_LOG(INFO) << "SVM capalibilties: "
140 << "NONE";
141 } else {
142 if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
143 MS_LOG(INFO) << "SVM capalibilties: "
144 << "SVM_FINE_GRAIN_BUFFER";
145 }
146 if (svm_capabilities_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
147 MS_LOG(INFO) << "SVM capalibilties: "
148 << "SVM_COARSE_GRAIN_BUFFER";
149 }
150 if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) {
151 MS_LOG(INFO) << "SVM capalibilties: "
152 << "SVM_COARSE_GRAIN_SYSTEM";
153 }
154 if (svm_capabilities_ & CL_DEVICE_SVM_ATOMICS) {
155 MS_LOG(INFO) << "SVM capalibilties: "
156 << "SVM_ATOMICS";
157 }
158 }
159 global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
160 max_alloc_size_ = device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
161 max_image2d_width_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
162 max_image2d_height_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
163 supported_extensions_ = std::string(device_->getInfo<CL_DEVICE_EXTENSIONS>());
164 cache_line_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
165 MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
166 MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
167 MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
168 MS_LOG(INFO) << "Max Alloc Size: " << max_alloc_size_;
169 MS_LOG(INFO) << "Compute Unit: " << compute_units_;
170 MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";
171 return RET_OK;
172 }
173
InitQueue(std::vector<cl::Platform> * platforms)174 int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
175 MS_ASSERT(platforms);
176 cl_int ret;
177 #if defined(SHARING_MEM_WITH_OPENGL) && defined(CL_HPP_TARGET_OPENCL_VERSION) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
178 // create context from glcontext
179 MS_LOG(INFO) << "Create special opencl context to share with OpenGL";
180 cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
181 CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0};
182 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &ret);
183 if (context_ == nullptr || ret != CL_SUCCESS) {
184 MS_LOG(ERROR) << "Create special OpenCL context failed, Create common OpenCL context then.";
185 if (context_ != nullptr) {
186 delete context_;
187 }
188 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
189 if (context_ == nullptr) {
190 delete device_;
191 MS_LOG(ERROR) << "Create OpenCL context failed!";
192 return RET_ERROR;
193 }
194 }
195 #else
196 MS_LOG(INFO) << "Create common opencl context";
197 #ifdef Debug
198 std::vector<cl_context_properties> ctx_properties = {CL_CONTEXT_PLATFORM,
199 (cl_context_properties)(*platforms)[0](),
200 CL_PRINTF_CALLBACK_ARM,
201 (cl_context_properties)printf_callback,
202 CL_PRINTF_BUFFERSIZE_ARM,
203 0x1000000,
204 0};
205 context_ =
206 new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, ctx_properties.data(), nullptr, nullptr, &ret);
207 if (context_ == nullptr || ret != CL_SUCCESS) {
208 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
209 }
210 #else
211 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
212 #endif
213 #endif
214 if (context_ == nullptr || ret != CL_SUCCESS) {
215 delete device_;
216 device_ = nullptr;
217 MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
218 return RET_ERROR;
219 }
220
221 default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, 0, &ret);
222 if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
223 delete device_;
224 delete context_;
225 device_ = nullptr;
226 context_ = nullptr;
227 MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
228 return RET_ERROR;
229 }
230
231 profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, CL_QUEUE_PROFILING_ENABLE, &ret);
232 if (profiling_command_queue_ == nullptr || ret != CL_SUCCESS) {
233 delete device_;
234 delete context_;
235 delete default_command_queue_;
236 device_ = nullptr;
237 context_ = nullptr;
238 default_command_queue_ = nullptr;
239 MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
240 return RET_ERROR;
241 }
242 return RET_OK;
243 }
244
245 // Init will get platforms info, get devices info, create opencl context.
Init()246 int OpenCLRuntime::Init() {
247 std::unique_lock<std::mutex> lck(g_init_mtx);
248 if (init_state_ == InitSuccess) {
249 return RET_OK;
250 } else if (init_state_ == InitFailed) {
251 return RET_ERROR;
252 }
253 init_state_ = InitFailed;
254
255 MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION;
256 MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION;
257 MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION;
258
259 #ifdef USE_OPENCL_WRAPPER
260 if (!lite::opencl::LoadOpenCLLibrary(&handle_)) {
261 MS_LOG(ERROR) << "Load OpenCL symbols failed!";
262 return RET_ERROR;
263 }
264 #endif // USE_OPENCL_WRAPPER
265 std::vector<cl::Platform> platforms;
266 cl_int ret = cl::Platform::get(&platforms);
267 if (platforms.empty()) {
268 MS_LOG(ERROR) << "OpenCL Platform not found!" << CLErrorCode(ret);
269 return RET_ERROR;
270 }
271 auto ms_ret = InitGPUDevice(&platforms);
272 if (ms_ret != RET_OK) {
273 return ms_ret;
274 }
275
276 // only support mali device.
277 if (gpu_info_.type == MALI || gpu_info_.type == MALI_T || gpu_info_.type == MALI_G) {
278 clImportMemoryARM = reinterpret_cast<clImportMemoryARMFunc>(dlsym(handle_, "clImportMemoryARM"));
279 if (clImportMemoryARM == nullptr) {
280 MS_LOG(ERROR) << "load func (clImportMemoryARM) failed!";
281 UnLoadOpenCLLibrary(handle_);
282 return false;
283 }
284 }
285
286 ms_ret = InitQueue(&platforms);
287 if (ms_ret != RET_OK) {
288 return ms_ret;
289 }
290
291 allocator_ = std::make_shared<OpenCLAllocator>(this);
292 if (allocator_ == nullptr) {
293 delete device_;
294 delete context_;
295 delete default_command_queue_;
296 delete profiling_command_queue_;
297 device_ = nullptr;
298 context_ = nullptr;
299 default_command_queue_ = nullptr;
300 profiling_command_queue_ = nullptr;
301 MS_LOG(ERROR) << "Command OpenCL allocator failed!";
302 return RET_ERROR;
303 }
304 LoadCache();
305 init_state_ = InitSuccess;
306 MS_LOG(INFO) << "OpenCLRuntime init done!";
307 return RET_OK;
308 }
309
Uninit()310 int OpenCLRuntime::Uninit() {
311 std::unique_lock<std::mutex> lck(g_init_mtx);
312 if (init_state_ != InitSuccess) {
313 return RET_OK;
314 }
315 if (StoreCache() != RET_OK) {
316 MS_LOG(ERROR) << "StoreCache failed!";
317 }
318 program_map_.clear();
319 delete default_command_queue_;
320 delete profiling_command_queue_;
321 delete context_;
322 delete device_;
323 allocator_ = nullptr;
324 default_command_queue_ = nullptr;
325 profiling_command_queue_ = nullptr;
326 context_ = nullptr;
327 device_ = nullptr;
328 init_state_ = UnInit;
329 return RET_OK;
330 }
331
~OpenCLRuntime()332 OpenCLRuntime::~OpenCLRuntime() { Uninit(); }
333
Context()334 cl::Context *OpenCLRuntime::Context() { return context_; }
335
Device()336 cl::Device *OpenCLRuntime::Device() { return device_; }
337
DeviceGlobalMemoryCacheSize() const338 uint64_t OpenCLRuntime::DeviceGlobalMemoryCacheSize() const { return global_memery_cachesize_; }
339
DeviceMaxWorkGroupSize() const340 uint64_t OpenCLRuntime::DeviceMaxWorkGroupSize() const { return max_work_group_size_; }
341
DeviceComputeUnits() const342 uint32_t OpenCLRuntime::DeviceComputeUnits() const { return compute_units_; }
343
DeviceMaxFreq() const344 uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }
345
346 // get kernel enqueue max work group size
GetMaxWorkGroupSize(const cl::Kernel & kernel)347 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
348 uint64_t max_workgroup_size = 0;
349 int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
350 if (ret != CL_SUCCESS) {
351 max_workgroup_size = 0;
352 }
353 return max_workgroup_size;
354 }
355
356 // opencl 2.0 can get SubGroupSize.
GetSubGroupSize(const cl::Kernel & kernel,const cl::NDRange & range)357 uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
358 uint32_t sub_group_size = 0;
359
360 if (ADRENO == gpu_info_.type) {
361 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 200 && \
362 defined(CL_TARGET_OPENCL_VERSION) && CL_TARGET_OPENCL_VERSION >= 210 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
363 cl_int cl_ret;
364 sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
365 if (cl_ret != CL_SUCCESS) {
366 CHECK_CL_SUCCESS(cl_ret)
367 sub_group_size = 0;
368 }
369 #else
370 sub_group_size = 0;
371 #endif
372 }
373
374 return sub_group_size;
375 }
376
GetGpuInfo()377 GpuInfo OpenCLRuntime::GetGpuInfo() { return gpu_info_; }
378
GetFp16Enable() const379 bool OpenCLRuntime::GetFp16Enable() const { return fp16_enable_; }
380
381 // if support fp16, set fp16 will success.
SetFp16Enable(bool enable)382 bool OpenCLRuntime::SetFp16Enable(bool enable) {
383 fp16_enable_ = enable && support_fp16_;
384 return fp16_enable_ == enable;
385 }
386
BuildKernel(const cl::Kernel & kernel,const std::string & program_name,const std::string & kernel_name,const std::vector<std::string> & build_options_ext,const bool is_builtin)387 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
388 const std::string &kernel_name, const std::vector<std::string> &build_options_ext,
389 const bool is_builtin) {
390 std::string build_option;
391 if (is_builtin) {
392 build_option = default_build_option_;
393 if (fp16_enable_) {
394 build_option +=
395 " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 "
396 "-DUINT4=ushort4"
397 " -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
398 } else {
399 build_option +=
400 " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 "
401 "-DUINT4=uint4"
402 " -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
403 }
404 build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_);
405 }
406 build_option =
407 std::accumulate(build_options_ext.begin(), build_options_ext.end(), build_option,
408 [](const std::string &options, const std::string &option) { return options + " " + option; });
409
410 cl::Program program;
411 auto program_key = std::make_pair(program_name, build_option);
412 auto iter = program_map_.find(program_key);
413 if (iter != program_map_.end()) {
414 program = iter->second;
415 } else {
416 flush_cache_ = true;
417 auto status = this->LoadProgram(program_name, &program);
418 if (!status) {
419 MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
420 return RET_ERROR;
421 }
422 status = this->BuildProgram(build_option, program);
423 if (!status) {
424 MS_LOG(ERROR) << program_name << " build failed!";
425 return RET_ERROR;
426 }
427 program_map_.emplace(program_key, program);
428 }
429
430 cl_int ret;
431 const_cast<cl::Kernel &>(kernel) = cl::Kernel(program, kernel_name.c_str(), &ret);
432 if (ret != CL_SUCCESS) {
433 MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret);
434 return RET_ERROR;
435 }
436 return RET_OK;
437 }
438
439 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
RunKernel(const cl::Kernel & kernel,const cl::NDRange & global,const cl::NDRange & local,cl::CommandQueue * command_queue,cl::Event * event)440 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
441 cl::CommandQueue *command_queue, cl::Event *event) {
442 if (command_queue == nullptr) {
443 if (profiling_) {
444 command_queue = profiling_command_queue_;
445 } else {
446 command_queue = default_command_queue_;
447 }
448 }
449 MS_ASSERT(local.size() == 0 || local.size() == global.size());
450 cl_int ret = CL_SUCCESS;
451 ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event);
452 if (ret != CL_SUCCESS) {
453 MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
454 return RET_ERROR;
455 }
456 static int cnt = 0;
457 const int flush_period = 10;
458 if (cnt % flush_period == 0) {
459 auto flush_ret = command_queue->flush();
460 if (flush_ret != CL_SUCCESS) {
461 MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
462 }
463 }
464 cnt++;
465 MS_LOG(DEBUG) << "RunKernel success!";
466 if (profiling_) {
467 event->wait();
468 }
469 return RET_OK;
470 }
471
472 // get gpu divce type
ParseGpuInfo(std::string device_name,std::string device_version)473 GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) {
474 GpuInfo info;
475 if (device_name == "QUALCOMM Adreno(TM)") {
476 info.type = ADRENO;
477 } else if (device_name.find("Mali") != std::string::npos) {
478 info.type = MALI;
479 // Mali type MALI-G or MALI_T
480 if (device_name.find("Mali-G") != std::string::npos) {
481 info.type = MALI_G;
482 } else if (device_name.find("Mali-T") != std::string::npos) {
483 info.type = MALI_T;
484 }
485 }
486 return info;
487 }
488
LoadSource(const std::string & program_name,const std::string & source)489 bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
490 auto it_source = g_source_map.find(program_name);
491 if (it_source == g_source_map.end()) {
492 g_source_map.emplace(program_name, source);
493 }
494 return true;
495 }
496
497 // load program with program name.
LoadProgram(const std::string & program_name,cl::Program * program)498 bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *program) {
499 auto it_source = g_source_map.find(program_name);
500 if (it_source != g_source_map.end()) {
501 cl::Program::Sources sources;
502 sources.push_back(it_source->second);
503 *program = cl::Program(*context_, sources);
504 return true;
505 } else {
506 MS_LOG(ERROR) << "Can't find kernel source !";
507 return false;
508 }
509 }
510
511 // build program with build options
BuildProgram(const std::string & build_option,const cl::Program & program)512 bool OpenCLRuntime::BuildProgram(const std::string &build_option, const cl::Program &program) {
513 cl_int ret = program.build({*device_}, build_option.c_str());
514 if (ret != CL_SUCCESS) {
515 if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
516 std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
517 MS_LOG(ERROR) << "Program build log: " << build_log;
518 }
519 MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret);
520 return false;
521 }
522 return true;
523 }
524
ReadOrWriteImage(void * buffer,void * data,bool is_read)525 int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) {
526 cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_;
527 auto *image = allocator_->GetImage(buffer);
528 if (image == nullptr) {
529 MS_LOG(WARNING) << "Can't get Image2D for " << buffer;
530 return RET_ERROR;
531 }
532 ImageSize img_size;
533 int ret = allocator_->GetImageSize(buffer, &img_size);
534 if (ret != RET_OK) {
535 MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer;
536 return RET_ERROR;
537 }
538 cl::array<size_t, 3> origin = {0, 0, 0};
539 cl::array<size_t, 3> region = {img_size.width, img_size.height, 1};
540 if (is_read) {
541 ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
542 } else {
543 ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
544 }
545 if (ret != CL_SUCCESS) {
546 MS_LOG(ERROR) << CLErrorCode(ret);
547 return RET_ERROR;
548 }
549 return RET_OK;
550 }
551
ReadImage(void * buffer,void * dst_data)552 int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); }
553
WriteImage(void * buffer,void * src_data)554 int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); }
555
CopyDeviceMemToHost(void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const556 bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
557 bool sync) const {
558 if (command_queue == nullptr) {
559 command_queue = default_command_queue_;
560 }
561 cl_int cl_ret = CL_SUCCESS;
562 const cl::Buffer *buffer = static_cast<const cl::Buffer *>(src);
563 if (command_queue != nullptr) {
564 cl_ret = command_queue->enqueueReadBuffer(*buffer, sync, 0, size, dst);
565 }
566 return cl_ret == CL_SUCCESS;
567 }
568
CopyHostMemToDevice(const void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const569 bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
570 bool sync) const {
571 if (command_queue == nullptr) {
572 command_queue = default_command_queue_;
573 }
574 cl_int cl_ret = CL_SUCCESS;
575 const cl::Buffer *buffer = static_cast<const cl::Buffer *>(dst);
576 if (command_queue != nullptr) {
577 cl_ret = command_queue->enqueueWriteBuffer(*buffer, sync, 0, size, src);
578 }
579 return cl_ret == CL_SUCCESS;
580 }
581
MapBuffer(const cl::Buffer & buffer,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const582 void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue,
583 bool sync) const {
584 if (command_queue == nullptr) {
585 command_queue = default_command_queue_;
586 }
587 return command_queue->enqueueMapBuffer(buffer, sync, flags, 0, size);
588 }
589
MapBuffer(void * host_ptr,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const590 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
591 if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
592 return RET_ERROR;
593 }
594 if (command_queue == nullptr) {
595 command_queue = default_command_queue_;
596 }
597 if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
598 return RET_ERROR;
599 }
600 return RET_OK;
601 }
602
MapBuffer(const cl::Image2D & buffer,bool sync,int flags,const std::vector<size_t> & region,cl::CommandQueue * command_queue) const603 void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> ®ion,
604 cl::CommandQueue *command_queue) const {
605 if (command_queue == nullptr) {
606 command_queue = default_command_queue_;
607 }
608 cl::size_type row_pitch;
609 cl::size_type slice_pitch;
610 cl::array<cl::size_type, 3> origin_{0, 0, 0};
611 cl::array<cl::size_type, 3> region_{region[0], region[1], region[2]};
612 return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch);
613 }
614
UnmapBuffer(const cl::Memory & buffer,void * host_ptr,cl::CommandQueue * command_queue) const615 int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
616 if (command_queue == nullptr) {
617 command_queue = default_command_queue_;
618 }
619 return command_queue->enqueueUnmapMemObject(buffer, host_ptr);
620 }
621
UnmapBuffer(void * host_ptr,cl::CommandQueue * command_queue) const622 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
623 if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
624 return RET_OK;
625 }
626 if (command_queue == nullptr) {
627 command_queue = default_command_queue_;
628 }
629 return clEnqueueSVMUnmap(command_queue->get(), host_ptr, 0, nullptr, nullptr);
630 }
631
SyncCommandQueue(cl::CommandQueue * command_queue)632 bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {
633 if (command_queue == nullptr) {
634 command_queue = default_command_queue_;
635 }
636 cl_int ret = command_queue->finish();
637 if (ret != CL_SUCCESS) {
638 MS_LOG(ERROR) << "Command queue sync failed: " << CLErrorCode(ret);
639 return RET_ERROR;
640 }
641 return ret == CL_SUCCESS;
642 }
643
GetKernelMaxWorkGroupSize(cl_kernel kernel,cl_device_id device_id)644 int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) {
645 size_t max_work_group_size;
646 cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
647 &max_work_group_size, nullptr);
648 if (ret != CL_SUCCESS) {
649 MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret);
650 }
651 return static_cast<int>(max_work_group_size);
652 }
653
GetKernelFromBinary(const std::string & kernel_name)654 cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) {
655 cl_int ret = CL_SUCCESS;
656 cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret);
657 if (ret != CL_SUCCESS) {
658 MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret);
659 }
660 return kernel;
661 }
662
663 // build program with IL
CreateProgramFromIL(const std::vector<char> & binary,const std::string & flag)664 cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag) {
665 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 210
666 cl::Program program = cl::Program(*context_, binary);
667 bool status = BuildProgram(default_build_opts_, program);
668 if (!status) {
669 MS_LOG(ERROR) << "Build program with IL failed!";
670 }
671 return program;
672 #else
673 MS_LOG(ERROR) << "Create program with IL failed! The compute capabitity of device should be 2.1 and higher.";
674 return cl::Program();
675 #endif
676 }
677
678 // build program with binary
CreateProgramFromBinary(const std::vector<unsigned char> & binary,const std::string & build_option)679 cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary,
680 const std::string &build_option) {
681 cl::Program program = cl::Program(*context_, {*device_}, {binary});
682 bool status = BuildProgram(build_option, program);
683 if (!status) {
684 MS_LOG(ERROR) << "Build program with binary failed!";
685 }
686 return program;
687 }
688
GetProgramBinary(const cl::Program & program)689 std::vector<unsigned char> OpenCLRuntime::GetProgramBinary(const cl::Program &program) {
690 cl_int ret = CL_SUCCESS;
691 auto binarys = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
692 if (ret != CL_SUCCESS) {
693 MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
694 }
695 if (binarys.empty()) {
696 MS_LOG(ERROR) << "binarys is empty";
697 return {};
698 }
699 return binarys.front();
700 }
701
LoadCache()702 void OpenCLRuntime::LoadCache() {
703 if (!enable_cache_) {
704 return;
705 }
706 size_t len;
707 std::unique_ptr<char[]> buf(lite::ReadFile(cache_path_.c_str(), &len));
708 if (buf == nullptr) {
709 MS_LOG(ERROR) << "Load opencl cache fail: buf == nullptr";
710 return;
711 }
712 auto gpu_cache = schema::GetGpuCache(buf.get());
713 if (gpu_cache == nullptr) {
714 MS_LOG(ERROR) << "Load opencl cache fail: gpu_cache == nullptr";
715 return;
716 }
717 auto *bins = gpu_cache->allBins();
718 if (bins == nullptr) {
719 MS_LOG(ERROR) << "Load opencl cache fail: bins == nullptr";
720 return;
721 }
722 for (auto i = 0; i < bins->size(); ++i) {
723 auto *bin = bins->template GetAs<schema::ProgramBinary>(i);
724 if (bin == nullptr) {
725 MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
726 return;
727 }
728 auto *pdata = bin->data();
729 MS_ASSERT(pdata);
730 if (pdata->size() == 0) {
731 continue;
732 }
733 std::vector<unsigned char> binary(pdata->begin(), pdata->end());
734 auto program = CreateProgramFromBinary(binary, bin->build_option()->str());
735 program_map_.emplace(std::make_pair(bin->program_name()->str(), bin->build_option()->str()), program);
736 MS_LOG(INFO) << "LoadCache " << bin->program_name() << " success, size=" << binary.size();
737 }
738 MS_LOG(INFO) << "Init opencl cache success";
739 }
740
StoreCache()741 int OpenCLRuntime::StoreCache() {
742 if (!enable_cache_) {
743 return RET_OK;
744 }
745 if (!flush_cache_) {
746 return RET_OK;
747 }
748 auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
749 if (fbb == nullptr) {
750 MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
751 return RET_ERROR;
752 }
753 std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
754 for (const auto &kv : program_map_) {
755 auto program_name = kv.first.first;
756 auto build_option = kv.first.second;
757 cl::Program program = kv.second;
758 auto binary = this->GetProgramBinary(program);
759 std::vector<int32_t> shape;
760 auto tune = schema::CreateTuneParam(*fbb, fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape),
761 fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape));
762 auto program_binary = schema::CreateProgramBinary(
763 *fbb, fbb->CreateString(program_name), fbb->CreateString(build_option), tune, fbb->CreateVector<uint8_t>(binary));
764 program_binarys.emplace_back(program_binary);
765 MS_LOG(INFO) << "StoreCache " << program_name << " success, size=" << binary.size();
766 }
767
768 auto data = fbb->CreateVector<flatbuffers::Offset<schema::ProgramBinary>>(program_binarys);
769 auto name = fbb->CreateString("OpenCLCache");
770 auto version = fbb->CreateString(cache_version_);
771 auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
772 fbb->Finish(gpu_cache);
773 uint8_t *buf = fbb->GetBufferPointer();
774 if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
775 MS_LOG(ERROR) << "WriteToBin failed.";
776 return RET_ERROR;
777 }
778 MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
779 return RET_OK;
780 }
781
CreateSharedMemoryBuffer(size_t size,void * host_ptr)782 cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
783 cl_int error = CL_SUCCESS;
784 cl_mem cl_buffer = clImportMemoryARM(context_->get(), CL_MEM_READ_WRITE, NULL, host_ptr, size, &error);
785 if (error != CL_SUCCESS) {
786 MS_LOG(ERROR) << "Create OpenCL shared memory failed for" << CLErrorCode(error);
787 return nullptr;
788 }
789 cl::Buffer *buffer = new (std::nothrow) cl::Buffer(cl_buffer, false);
790 if (buffer == nullptr) {
791 MS_LOG(ERROR) << "New OpenCL Buffer failed";
792 return nullptr;
793 }
794 return buffer;
795 }
796 } // namespace mindspore::lite::opencl
797