1 /**
2 * Copyright 2019 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/litert/kernel/gpu/opencl/opencl_runtime.h"
18 #include <vector>
19 #include <numeric>
20 #include <utility>
21
22 #ifdef __ANDROID__
23 #include <dlfcn.h>
24 #endif
25
26 #include "include/errorcode.h"
27 #include "src/litert/kernel/opencl/utils.h"
28 #include "src/litert/kernel/gpu/opencl/opencl_allocator.h"
29 #include "src/common/file_utils.h"
30
31 using mindspore::kernel::CLErrorCode;
32
33 namespace mindspore::lite::opencl {
34 static std::map<std::string, std::string> g_source_map;
35 static std::mutex g_mtx;
36 static std::mutex g_init_mtx;
37
38 InitState OpenCLRuntime::init_state_ = UnInit;
39 OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr;
40 size_t OpenCLRuntime::instance_count_ = 0;
41
GetInstance()42 OpenCLRuntime *OpenCLRuntime::GetInstance() {
43 std::unique_lock<std::mutex> lck(g_mtx);
44 static OpenCLRuntime ocl_runtime;
45 if (instance_count_ == 0) {
46 ocl_runtime_instance_ = &ocl_runtime;
47 }
48 instance_count_++;
49 return ocl_runtime_instance_;
50 }
51
DeleteInstance()52 void OpenCLRuntime::DeleteInstance() {
53 std::unique_lock<std::mutex> lck(g_mtx);
54 if (instance_count_ == 0) {
55 MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!";
56 return;
57 }
58 instance_count_--;
59 if (instance_count_ == 0) {
60 ocl_runtime_instance_->Uninit();
61 }
62 }
63
printf_callback(const char * buffer,size_t length,size_t final,void * user_data)64 void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
65 fwrite(buffer, 1, length, stdout);
66 }
67
InitGPUDevice(std::vector<cl::Platform> * platforms)68 int OpenCLRuntime::InitGPUDevice(std::vector<cl::Platform> *platforms) {
69 MS_ASSERT(platforms);
70 // search GPU
71 std::vector<cl::Device> devices;
72 int ret = RET_OK;
73 for (auto &platform : *platforms) {
74 std::string platform_name;
75 ret = platform.getInfo(CL_PLATFORM_NAME, &platform_name);
76 if (ret != CL_SUCCESS) {
77 MS_LOG(WARNING) << CLErrorCode(ret);
78 }
79 ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
80 if (ret != CL_SUCCESS) {
81 MS_LOG(WARNING) << CLErrorCode(ret);
82 }
83 MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";
84
85 if (!devices.empty()) {
86 std::string device_name = devices[0].getInfo<CL_DEVICE_NAME>();
87 MS_LOG(INFO) << "Find GPU: " << device_name.c_str();
88 cl::Platform::setDefault(platform);
89 break;
90 }
91 }
92
93 // not found, return error code.
94 if (devices.empty()) {
95 MS_LOG(ERROR) << "OpenCL Device not found!";
96 return RET_ERROR;
97 }
98
99 device_ = new (std::nothrow) cl::Device();
100 if (device_ == nullptr) {
101 MS_LOG(ERROR) << "Create OpenCL device failed!";
102 return RET_ERROR;
103 }
104 *device_ = devices[0];
105 max_work_item_sizes_ = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
106 max_work_group_size_ = max_work_item_sizes_[0];
107 const std::string device_name = device_->getInfo<CL_DEVICE_NAME>();
108 const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
109 const std::string opencl_version = device_->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
110 clGetDeviceInfo((*device_)(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &image_pitch_align_, nullptr);
111 MS_LOG(INFO) << "Device name:\t" << device_name;
112 MS_LOG(INFO) << "Opencl version:\t" << device_version;
113 MS_LOG(INFO) << "Image pitch alignment:\t" << image_pitch_align_;
114 MS_LOG(INFO) << "Highest OpenCL c version:\t" << opencl_version;
115 MS_LOG(INFO) << "Max work item size:\t" << max_work_item_sizes_[0] << " : " << max_work_item_sizes_[1] << " : "
116 << max_work_item_sizes_[2];
117
118 gpu_info_ = ParseGpuInfo(device_name, device_version);
119 // get cache size, compute units and frequency.
120 ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
121 if (ret != CL_SUCCESS) {
122 MS_LOG(WARNING) << CLErrorCode(ret);
123 }
124 ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
125 if (ret != CL_SUCCESS) {
126 MS_LOG(WARNING) << CLErrorCode(ret);
127 }
128 ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
129 if (ret != CL_SUCCESS) {
130 MS_LOG(WARNING) << CLErrorCode(ret);
131 }
132 cl_device_fp_config fp_config;
133 auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
134 support_fp16_ = CL_SUCCESS == success && fp_config > 0;
135
136 ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
137 if (ret != CL_SUCCESS || svm_capabilities_ == 0) {
138 svm_capabilities_ = 0;
139 MS_LOG(INFO) << "SVM capalibilties: "
140 << "NONE";
141 } else {
142 if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
143 MS_LOG(INFO) << "SVM capalibilties: "
144 << "SVM_FINE_GRAIN_BUFFER";
145 }
146 if (svm_capabilities_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
147 MS_LOG(INFO) << "SVM capalibilties: "
148 << "SVM_COARSE_GRAIN_BUFFER";
149 }
150 if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) {
151 MS_LOG(INFO) << "SVM capalibilties: "
152 << "SVM_COARSE_GRAIN_SYSTEM";
153 }
154 if (svm_capabilities_ & CL_DEVICE_SVM_ATOMICS) {
155 MS_LOG(INFO) << "SVM capalibilties: "
156 << "SVM_ATOMICS";
157 }
158 }
159 global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
160 max_alloc_size_ = device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
161 max_image2d_width_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
162 max_image2d_height_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
163 supported_extensions_ = std::string(device_->getInfo<CL_DEVICE_EXTENSIONS>());
164 cache_line_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
165 MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
166 MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
167 MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
168 MS_LOG(INFO) << "Max Alloc Size: " << max_alloc_size_;
169 MS_LOG(INFO) << "Compute Unit: " << compute_units_;
170 MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";
171 return RET_OK;
172 }
173
InitQueue(std::vector<cl::Platform> * platforms)174 int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
175 MS_ASSERT(platforms);
176 cl_int ret = 0;
177 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
178 // create context from glcontext
179 if (this->GetGLTextureEnable()) {
180 // create context from glcontext
181 MS_LOG(INFO) << "Create special opencl context to share with OpenGL";
182
183 if (!CheckGLContext()) {
184 MS_LOG(ERROR) << "GL Context error, please check glcontext config";
185 return RET_ERROR;
186 }
187 if (!CheckGLDisplay()) {
188 MS_LOG(ERROR) << "GL Display error, please check gldisplay config";
189 return RET_ERROR;
190 }
191
192 cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)GetGLContext(),
193 CL_EGL_DISPLAY_KHR, (cl_context_properties)GetGLDisplay(), 0};
194 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &ret);
195 if (context_ == nullptr || ret != CL_SUCCESS) {
196 MS_LOG(ERROR)
197 << "Create special OpenCL context failed, The device unspport Sharing or OpenGL Context is not Init";
198 this->enable_gl_texture_ = false;
199 if (context_ != nullptr) {
200 delete context_;
201 }
202 return RET_ERROR;
203 }
204 } else {
205 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
206 if (context_ == nullptr) {
207 delete device_;
208 MS_LOG(ERROR) << "Create OpenCL context failed!";
209 return RET_ERROR;
210 }
211 }
212 #else
213 MS_LOG(INFO) << "Create common opencl context";
214 #ifdef Debug
215 std::vector<cl_context_properties> ctx_properties = {CL_CONTEXT_PLATFORM,
216 (cl_context_properties)(*platforms)[0](),
217 CL_PRINTF_CALLBACK_ARM,
218 (cl_context_properties)printf_callback,
219 CL_PRINTF_BUFFERSIZE_ARM,
220 0x1000000,
221 0};
222 context_ =
223 new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, ctx_properties.data(), nullptr, nullptr, &ret);
224 if (context_ == nullptr || ret != CL_SUCCESS) {
225 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
226 }
227 #else
228 context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
229 #endif
230 #endif
231 if (context_ == nullptr || ret != CL_SUCCESS) {
232 delete device_;
233 device_ = nullptr;
234 MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
235 return RET_ERROR;
236 }
237
238 default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, 0, &ret);
239 if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
240 delete device_;
241 delete context_;
242 device_ = nullptr;
243 context_ = nullptr;
244 MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
245 return RET_ERROR;
246 }
247
248 profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, CL_QUEUE_PROFILING_ENABLE, &ret);
249 if (profiling_command_queue_ == nullptr || ret != CL_SUCCESS) {
250 delete device_;
251 delete context_;
252 delete default_command_queue_;
253 device_ = nullptr;
254 context_ = nullptr;
255 default_command_queue_ = nullptr;
256 MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
257 return RET_ERROR;
258 }
259 return RET_OK;
260 }
261
262 // Init will get platforms info, get devices info, create opencl context.
Init()263 int OpenCLRuntime::Init() {
264 std::unique_lock<std::mutex> lck(g_init_mtx);
265 if (init_state_ == InitSuccess) {
266 return RET_OK;
267 } else if (init_state_ == InitFailed) {
268 return RET_ERROR;
269 }
270 init_state_ = InitFailed;
271
272 MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION;
273 MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION;
274 MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION;
275
276 #ifdef USE_OPENCL_WRAPPER
277 if (!lite::opencl::LoadOpenCLLibrary(&handle_)) {
278 MS_LOG(ERROR) << "Load OpenCL symbols failed!";
279 return RET_ERROR;
280 }
281 #endif // USE_OPENCL_WRAPPER
282 std::vector<cl::Platform> platforms;
283 cl_int ret = cl::Platform::get(&platforms);
284 if (platforms.empty()) {
285 MS_LOG(ERROR) << "OpenCL Platform not found!" << CLErrorCode(ret);
286 return RET_ERROR;
287 }
288 auto ms_ret = InitGPUDevice(&platforms);
289 if (ms_ret != RET_OK) {
290 return ms_ret;
291 }
292
293 #ifdef __ANDROID__
294 // only support mali device.
295 if (gpu_info_.type == MALI || gpu_info_.type == MALI_T || gpu_info_.type == MALI_G || gpu_info_.type == MALI_G78) {
296 clImportMemoryARM = reinterpret_cast<clImportMemoryARMFunc>(dlsym(handle_, "clImportMemoryARM"));
297 if (clImportMemoryARM == nullptr) {
298 MS_LOG(ERROR) << "load func (clImportMemoryARM) failed!";
299 UnLoadOpenCLLibrary(handle_);
300 return false;
301 }
302 }
303
304 // only gltexture enable, load clCreateFromGLTexture func
305 if (this->GetGLTextureEnable()) {
306 clCreateFromGLTexture = reinterpret_cast<clCreateFromGLTextureFunc>(dlsym(handle_, "clCreateFromGLTexture"));
307 if (clCreateFromGLTexture == nullptr) {
308 MS_LOG(ERROR) << "load func (clCreateFromGLTexture) failed!";
309 UnLoadOpenCLLibrary(handle_);
310 return false;
311 }
312 }
313 #endif
314
315 ms_ret = InitQueue(&platforms);
316 if (ms_ret != RET_OK) {
317 return ms_ret;
318 }
319
320 allocator_ = std::make_shared<OpenCLAllocator>(this);
321 if (allocator_ == nullptr) {
322 delete device_;
323 delete context_;
324 delete default_command_queue_;
325 delete profiling_command_queue_;
326 device_ = nullptr;
327 context_ = nullptr;
328 default_command_queue_ = nullptr;
329 profiling_command_queue_ = nullptr;
330 MS_LOG(ERROR) << "Command OpenCL allocator failed!";
331 return RET_ERROR;
332 }
333 LoadCache();
334 init_state_ = InitSuccess;
335 MS_LOG(INFO) << "OpenCLRuntime init done!";
336 return RET_OK;
337 }
338
Uninit()339 int OpenCLRuntime::Uninit() {
340 std::unique_lock<std::mutex> lck(g_init_mtx);
341 if (init_state_ != InitSuccess) {
342 return RET_OK;
343 }
344 if (StoreCache() != RET_OK) {
345 MS_LOG(ERROR) << "StoreCache failed!";
346 }
347 program_map_.clear();
348 delete default_command_queue_;
349 delete profiling_command_queue_;
350 delete context_;
351 delete device_;
352 allocator_ = nullptr;
353 default_command_queue_ = nullptr;
354 profiling_command_queue_ = nullptr;
355 context_ = nullptr;
356 device_ = nullptr;
357 init_state_ = UnInit;
358 return RET_OK;
359 }
360
~OpenCLRuntime()361 OpenCLRuntime::~OpenCLRuntime() { Uninit(); }
362
Context()363 cl::Context *OpenCLRuntime::Context() { return context_; }
364
Device()365 cl::Device *OpenCLRuntime::Device() { return device_; }
366
DeviceGlobalMemoryCacheSize() const367 uint64_t OpenCLRuntime::DeviceGlobalMemoryCacheSize() const { return global_memery_cachesize_; }
368
DeviceMaxWorkGroupSize() const369 uint64_t OpenCLRuntime::DeviceMaxWorkGroupSize() const { return max_work_group_size_; }
370
DeviceComputeUnits() const371 uint32_t OpenCLRuntime::DeviceComputeUnits() const { return compute_units_; }
372
DeviceMaxFreq() const373 uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }
374
375 // get kernel enqueue max work group size
GetMaxWorkGroupSize(const cl::Kernel & kernel)376 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
377 uint64_t max_workgroup_size = 0;
378 int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
379 if (ret != CL_SUCCESS) {
380 max_workgroup_size = 0;
381 }
382 return max_workgroup_size;
383 }
384
385 // opencl 2.0 can get SubGroupSize.
GetSubGroupSize(const cl::Kernel & kernel,const cl::NDRange & range)386 uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
387 uint32_t sub_group_size = 0;
388
389 if (ADRENO == gpu_info_.type) {
390 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 200 && \
391 defined(CL_TARGET_OPENCL_VERSION) && CL_TARGET_OPENCL_VERSION >= 210 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
392 cl_int cl_ret;
393 sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
394 if (cl_ret != CL_SUCCESS) {
395 CHECK_CL_SUCCESS(cl_ret)
396 sub_group_size = 0;
397 }
398 #else
399 sub_group_size = 0;
400 #endif
401 }
402
403 return sub_group_size;
404 }
405
GetGpuInfo()406 GpuInfo OpenCLRuntime::GetGpuInfo() { return gpu_info_; }
407
GetFp16Enable() const408 bool OpenCLRuntime::GetFp16Enable() const { return fp16_enable_; }
409
410 // if support fp16, set fp16 will success.
SetFp16Enable(bool enable)411 bool OpenCLRuntime::SetFp16Enable(bool enable) {
412 fp16_enable_ = enable && support_fp16_;
413 return fp16_enable_ == enable;
414 }
415
SetGLTextureEnable(bool enable)416 bool OpenCLRuntime::SetGLTextureEnable(bool enable) {
417 enable_gl_texture_ = enable;
418 return enable_gl_texture_ == enable;
419 }
420
GetGLTextureEnable() const421 bool OpenCLRuntime::GetGLTextureEnable() const { return enable_gl_texture_; }
422
BuildKernel(const cl::Kernel & kernel,const std::string & program_name,const std::string & kernel_name,const std::vector<std::string> & build_options_ext,const bool is_builtin)423 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
424 const std::string &kernel_name, const std::vector<std::string> &build_options_ext,
425 const bool is_builtin) {
426 std::string build_option;
427 if (is_builtin) {
428 build_option = default_build_option_;
429 if (fp16_enable_) {
430 build_option +=
431 " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 "
432 "-DUINT4=ushort4"
433 " -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
434 } else {
435 build_option +=
436 " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 "
437 "-DUINT4=uint4"
438 " -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
439 }
440 build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_);
441 }
442 build_option =
443 std::accumulate(build_options_ext.begin(), build_options_ext.end(), build_option,
444 [](const std::string &options, const std::string &option) { return options + " " + option; });
445
446 cl::Program program;
447 auto program_key = std::make_pair(program_name, build_option);
448 auto iter = program_map_.find(program_key);
449 if (iter != program_map_.end()) {
450 program = iter->second;
451 } else {
452 flush_cache_ = true;
453 auto status = this->LoadProgram(program_name, &program);
454 if (!status) {
455 MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
456 return RET_ERROR;
457 }
458 status = this->BuildProgram(build_option, program);
459 if (!status) {
460 MS_LOG(ERROR) << program_name << " build failed!";
461 return RET_ERROR;
462 }
463 program_map_.emplace(program_key, program);
464 }
465
466 cl_int ret;
467 const_cast<cl::Kernel &>(kernel) = cl::Kernel(program, kernel_name.c_str(), &ret);
468 if (ret != CL_SUCCESS) {
469 MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret);
470 return RET_ERROR;
471 }
472 return RET_OK;
473 }
474
475 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
RunKernel(const cl::Kernel & kernel,const cl::NDRange & global,const cl::NDRange & local,cl::CommandQueue * command_queue,cl::Event * event)476 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
477 cl::CommandQueue *command_queue, cl::Event *event) {
478 if (command_queue == nullptr) {
479 if (profiling_) {
480 command_queue = profiling_command_queue_;
481 } else {
482 command_queue = default_command_queue_;
483 }
484 }
485 MS_ASSERT(local.size() == 0 || local.size() == global.size());
486 cl_int ret = CL_SUCCESS;
487 ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event);
488 if (ret != CL_SUCCESS) {
489 MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
490 return RET_ERROR;
491 }
492 static int cnt = 0;
493 const int flush_period = 10;
494 if (MALI_G78 == gpu_info_.type) {
495 auto flush_ret = command_queue->flush();
496 if (flush_ret != CL_SUCCESS) {
497 MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
498 }
499 } else {
500 if (cnt % flush_period == 0) {
501 auto flush_ret = command_queue->flush();
502 if (flush_ret != CL_SUCCESS) {
503 MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
504 }
505 }
506 }
507
508 cnt++;
509 MS_LOG(DEBUG) << "RunKernel success!";
510 if (profiling_) {
511 event->wait();
512 }
513 return RET_OK;
514 }
515
516 // get gpu divce type
ParseGpuInfo(std::string device_name,std::string device_version)517 GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) {
518 GpuInfo info;
519 if (device_name == "QUALCOMM Adreno(TM)") {
520 info.type = ADRENO;
521 } else if (device_name.find("Mali") != std::string::npos) {
522 info.type = MALI;
523 // Mali type MALI-G or MALI_T
524 if (device_name.find("Mali-G") != std::string::npos) {
525 info.type = MALI_G;
526 if (device_name.find("Mali-G78") != std::string::npos) {
527 info.type = MALI_G78;
528 }
529 } else if (device_name.find("Mali-T") != std::string::npos) {
530 info.type = MALI_T;
531 }
532 }
533 return info;
534 }
535
LoadSource(const std::string & program_name,const std::string & source)536 bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
537 auto it_source = g_source_map.find(program_name);
538 if (it_source == g_source_map.end()) {
539 g_source_map.emplace(program_name, source);
540 }
541 return true;
542 }
543
544 // load program with program name.
LoadProgram(const std::string & program_name,cl::Program * program)545 bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *program) {
546 auto it_source = g_source_map.find(program_name);
547 if (it_source != g_source_map.end()) {
548 cl::Program::Sources sources;
549 sources.push_back(it_source->second);
550 *program = cl::Program(*context_, sources);
551 return true;
552 } else {
553 MS_LOG(ERROR) << "Can't find kernel source !";
554 return false;
555 }
556 }
557
558 // build program with build options
BuildProgram(const std::string & build_option,const cl::Program & program)559 bool OpenCLRuntime::BuildProgram(const std::string &build_option, const cl::Program &program) {
560 cl_int ret = program.build({*device_}, build_option.c_str());
561 if (ret != CL_SUCCESS) {
562 if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
563 std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
564 MS_LOG(ERROR) << "Program build log: " << build_log;
565 }
566 MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret);
567 return false;
568 }
569 return true;
570 }
571
ReadOrWriteImage(void * buffer,void * data,bool is_read)572 int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) {
573 cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_;
574 auto *image = allocator_->GetImage(buffer);
575 if (image == nullptr) {
576 MS_LOG(WARNING) << "Can't get Image2D for " << buffer;
577 return RET_ERROR;
578 }
579 ImageSize img_size;
580 int ret = allocator_->GetImageSize(buffer, &img_size);
581 if (ret != RET_OK) {
582 MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer;
583 return RET_ERROR;
584 }
585 cl::array<size_t, 3> origin = {0, 0, 0};
586 cl::array<size_t, 3> region = {img_size.width, img_size.height, 1};
587 if (is_read) {
588 ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
589 } else {
590 ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
591 }
592 if (ret != CL_SUCCESS) {
593 MS_LOG(ERROR) << CLErrorCode(ret);
594 return RET_ERROR;
595 }
596 return RET_OK;
597 }
598
ReadImage(void * buffer,void * dst_data)599 int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); }
600
WriteImage(void * buffer,void * src_data)601 int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); }
602
CopyDeviceMemToHost(void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const603 bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
604 bool sync) const {
605 if (command_queue == nullptr) {
606 command_queue = default_command_queue_;
607 }
608 cl_int cl_ret = CL_SUCCESS;
609 const cl::Buffer *buffer = static_cast<const cl::Buffer *>(src);
610 if (command_queue != nullptr) {
611 cl_ret = command_queue->enqueueReadBuffer(*buffer, sync, 0, size, dst);
612 }
613 return cl_ret == CL_SUCCESS;
614 }
615
CopyHostMemToDevice(const void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const616 bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
617 bool sync) const {
618 if (command_queue == nullptr) {
619 command_queue = default_command_queue_;
620 }
621 cl_int cl_ret = CL_SUCCESS;
622 const cl::Buffer *buffer = static_cast<const cl::Buffer *>(dst);
623 if (command_queue != nullptr) {
624 cl_ret = command_queue->enqueueWriteBuffer(*buffer, sync, 0, size, src);
625 }
626 return cl_ret == CL_SUCCESS;
627 }
628
MapBuffer(const cl::Buffer & buffer,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const629 void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue,
630 bool sync) const {
631 if (command_queue == nullptr) {
632 command_queue = default_command_queue_;
633 }
634 return command_queue->enqueueMapBuffer(buffer, sync, flags, 0, size);
635 }
636
MapBuffer(void * host_ptr,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const637 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
638 if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
639 return RET_ERROR;
640 }
641 if (command_queue == nullptr) {
642 command_queue = default_command_queue_;
643 }
644 if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
645 return RET_ERROR;
646 }
647 return RET_OK;
648 }
649
MapBuffer(const cl::Image2D & buffer,bool sync,int flags,const std::vector<size_t> & region,cl::CommandQueue * command_queue) const650 void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> ®ion,
651 cl::CommandQueue *command_queue) const {
652 if (command_queue == nullptr) {
653 command_queue = default_command_queue_;
654 }
655 cl::size_type row_pitch;
656 cl::size_type slice_pitch;
657 cl::array<cl::size_type, 3> origin_{0, 0, 0};
658 cl::array<cl::size_type, 3> region_{region[0], region[1], region[2]};
659 return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch);
660 }
661
UnmapBuffer(const cl::Memory & buffer,void * host_ptr,cl::CommandQueue * command_queue) const662 int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
663 if (command_queue == nullptr) {
664 command_queue = default_command_queue_;
665 }
666 return command_queue->enqueueUnmapMemObject(buffer, host_ptr);
667 }
668
UnmapBuffer(void * host_ptr,cl::CommandQueue * command_queue) const669 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
670 if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
671 return RET_OK;
672 }
673 if (command_queue == nullptr) {
674 command_queue = default_command_queue_;
675 }
676 return clEnqueueSVMUnmap(command_queue->get(), host_ptr, 0, nullptr, nullptr);
677 }
678
SyncCommandQueue(cl::CommandQueue * command_queue)679 bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {
680 if (command_queue == nullptr) {
681 command_queue = default_command_queue_;
682 }
683 cl_int ret = command_queue->finish();
684 if (ret != CL_SUCCESS) {
685 MS_LOG(ERROR) << "Command queue sync failed: " << CLErrorCode(ret);
686 return RET_ERROR;
687 }
688 return ret == CL_SUCCESS;
689 }
690
GetKernelMaxWorkGroupSize(cl_kernel kernel,cl_device_id device_id)691 int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) {
692 size_t max_work_group_size;
693 cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
694 &max_work_group_size, nullptr);
695 if (ret != CL_SUCCESS) {
696 MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret);
697 }
698 return static_cast<int>(max_work_group_size);
699 }
700
GetKernelFromBinary(const std::string & kernel_name)701 cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) {
702 cl_int ret = CL_SUCCESS;
703 cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret);
704 if (ret != CL_SUCCESS) {
705 MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret);
706 }
707 return kernel;
708 }
709
710 // build program with IL
CreateProgramFromIL(const std::vector<char> & binary,const std::string & flag)711 cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag) {
712 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 210
713 cl::Program program = cl::Program(*context_, binary);
714 bool status = BuildProgram(default_build_opts_, program);
715 if (!status) {
716 MS_LOG(ERROR) << "Build program with IL failed!";
717 }
718 return program;
719 #else
720 MS_LOG(ERROR) << "Create program with IL failed! The compute capabitity of device should be 2.1 and higher.";
721 return cl::Program();
722 #endif
723 }
724
725 // build program with binary
CreateProgramFromBinary(const std::vector<unsigned char> & binary,const std::string & build_option)726 cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary,
727 const std::string &build_option) {
728 cl::Program program = cl::Program(*context_, {*device_}, {binary});
729 bool status = BuildProgram(build_option, program);
730 if (!status) {
731 MS_LOG(ERROR) << "Build program with binary failed!";
732 }
733 return program;
734 }
735
GetProgramBinary(const cl::Program & program)736 std::vector<unsigned char> OpenCLRuntime::GetProgramBinary(const cl::Program &program) {
737 cl_int ret = CL_SUCCESS;
738 auto binarys = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
739 if (ret != CL_SUCCESS) {
740 MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
741 }
742 if (binarys.empty()) {
743 MS_LOG(ERROR) << "binarys is empty";
744 return {};
745 }
746 return binarys.front();
747 }
748
LoadCache()749 void OpenCLRuntime::LoadCache() {
750 if (!enable_cache_) {
751 return;
752 }
753 size_t len;
754 std::unique_ptr<char[]> buf(lite::ReadFile(cache_path_.c_str(), &len));
755 if (buf == nullptr) {
756 MS_LOG(ERROR) << "Load opencl cache fail: buf == nullptr";
757 return;
758 }
759 auto gpu_cache = schema::GetGpuCache(buf.get());
760 if (gpu_cache == nullptr) {
761 MS_LOG(ERROR) << "Load opencl cache fail: gpu_cache == nullptr";
762 return;
763 }
764 auto *bins = gpu_cache->allBins();
765 if (bins == nullptr) {
766 MS_LOG(ERROR) << "Load opencl cache fail: bins == nullptr";
767 return;
768 }
769 for (size_t i = 0; i < bins->size(); ++i) {
770 auto *bin = bins->template GetAs<schema::ProgramBinary>(i);
771 if (bin == nullptr) {
772 MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
773 return;
774 }
775 auto *pdata = bin->data();
776 MS_ASSERT(pdata);
777 if (pdata->size() == 0) {
778 continue;
779 }
780 std::vector<unsigned char> binary(pdata->begin(), pdata->end());
781 auto program = CreateProgramFromBinary(binary, bin->build_option()->str());
782 program_map_.emplace(std::make_pair(bin->program_name()->str(), bin->build_option()->str()), program);
783 MS_LOG(INFO) << "LoadCache " << bin->program_name() << " success, size=" << binary.size();
784 }
785 MS_LOG(INFO) << "Init opencl cache success";
786 }
787
StoreCache()788 int OpenCLRuntime::StoreCache() {
789 if (!enable_cache_) {
790 return RET_OK;
791 }
792 if (!flush_cache_) {
793 return RET_OK;
794 }
795 auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
796 if (fbb == nullptr) {
797 MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
798 return RET_ERROR;
799 }
800 std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
801 for (const auto &kv : program_map_) {
802 auto program_name = kv.first.first;
803 auto build_option = kv.first.second;
804 cl::Program program = kv.second;
805 auto binary = this->GetProgramBinary(program);
806 std::vector<int32_t> shape;
807 auto tune = schema::CreateTuneParam(*fbb, fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape),
808 fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape));
809 auto program_binary = schema::CreateProgramBinary(
810 *fbb, fbb->CreateString(program_name), fbb->CreateString(build_option), tune, fbb->CreateVector<uint8_t>(binary));
811 program_binarys.emplace_back(program_binary);
812 MS_LOG(INFO) << "StoreCache " << program_name << " success, size=" << binary.size();
813 }
814
815 auto data = fbb->CreateVector<flatbuffers::Offset<schema::ProgramBinary>>(program_binarys);
816 auto name = fbb->CreateString("OpenCLCache");
817 auto version = fbb->CreateString(cache_version_);
818 auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
819 fbb->Finish(gpu_cache);
820 uint8_t *buf = fbb->GetBufferPointer();
821 if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
822 MS_LOG(ERROR) << "WriteToBin failed.";
823 return RET_ERROR;
824 }
825 MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
826 return RET_OK;
827 }
828
CreateSharedMemoryBuffer(size_t size,void * host_ptr)829 cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
830 cl_int error = CL_SUCCESS;
831 cl_mem cl_buffer = clImportMemoryARM(context_->get(), CL_MEM_READ_WRITE, NULL, host_ptr, size, &error);
832 if (error != CL_SUCCESS) {
833 MS_LOG(ERROR) << "Create OpenCL shared memory failed for" << CLErrorCode(error);
834 return nullptr;
835 }
836 cl::Buffer *buffer = new (std::nothrow) cl::Buffer(cl_buffer, false);
837 if (buffer == nullptr) {
838 MS_LOG(ERROR) << "New OpenCL Buffer failed";
839 return nullptr;
840 }
841 return buffer;
842 }
843 } // namespace mindspore::lite::opencl
844