• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/litert/kernel/gpu/opencl/opencl_runtime.h"
18 #include <vector>
19 #include <numeric>
20 #include <utility>
21 
22 #ifdef __ANDROID__
23 #include <dlfcn.h>
24 #endif
25 
26 #include "include/errorcode.h"
27 #include "src/litert/kernel/opencl/utils.h"
28 #include "src/litert/kernel/gpu/opencl/opencl_allocator.h"
29 #include "src/common/file_utils.h"
30 
31 using mindspore::kernel::CLErrorCode;
32 
33 namespace mindspore::lite::opencl {
34 static std::map<std::string, std::string> g_source_map;
35 static std::mutex g_mtx;
36 static std::mutex g_init_mtx;
37 
38 InitState OpenCLRuntime::init_state_ = UnInit;
39 OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr;
40 size_t OpenCLRuntime::instance_count_ = 0;
41 
GetInstance()42 OpenCLRuntime *OpenCLRuntime::GetInstance() {
43   std::unique_lock<std::mutex> lck(g_mtx);
44   static OpenCLRuntime ocl_runtime;
45   if (instance_count_ == 0) {
46     ocl_runtime_instance_ = &ocl_runtime;
47   }
48   instance_count_++;
49   return ocl_runtime_instance_;
50 }
51 
DeleteInstance()52 void OpenCLRuntime::DeleteInstance() {
53   std::unique_lock<std::mutex> lck(g_mtx);
54   if (instance_count_ == 0) {
55     MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!";
56     return;
57   }
58   instance_count_--;
59   if (instance_count_ == 0) {
60     ocl_runtime_instance_->Uninit();
61   }
62 }
63 
printf_callback(const char * buffer,size_t length,size_t final,void * user_data)64 void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
65   fwrite(buffer, 1, length, stdout);
66 }
67 
InitGPUDevice(std::vector<cl::Platform> * platforms)68 int OpenCLRuntime::InitGPUDevice(std::vector<cl::Platform> *platforms) {
69   MS_ASSERT(platforms);
70   // search GPU
71   std::vector<cl::Device> devices;
72   int ret = RET_OK;
73   for (auto &platform : *platforms) {
74     std::string platform_name;
75     ret = platform.getInfo(CL_PLATFORM_NAME, &platform_name);
76     if (ret != CL_SUCCESS) {
77       MS_LOG(WARNING) << CLErrorCode(ret);
78     }
79     ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
80     if (ret != CL_SUCCESS) {
81       MS_LOG(WARNING) << CLErrorCode(ret);
82     }
83     MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";
84 
85     if (!devices.empty()) {
86       std::string device_name = devices[0].getInfo<CL_DEVICE_NAME>();
87       MS_LOG(INFO) << "Find GPU: " << device_name.c_str();
88       cl::Platform::setDefault(platform);
89       break;
90     }
91   }
92 
93   // not found, return error code.
94   if (devices.empty()) {
95     MS_LOG(ERROR) << "OpenCL Device not found!";
96     return RET_ERROR;
97   }
98 
99   device_ = new (std::nothrow) cl::Device();
100   if (device_ == nullptr) {
101     MS_LOG(ERROR) << "Create OpenCL device failed!";
102     return RET_ERROR;
103   }
104   *device_ = devices[0];
105   max_work_item_sizes_ = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
106   max_work_group_size_ = max_work_item_sizes_[0];
107   const std::string device_name = device_->getInfo<CL_DEVICE_NAME>();
108   const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
109   const std::string opencl_version = device_->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
110   clGetDeviceInfo((*device_)(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &image_pitch_align_, nullptr);
111   MS_LOG(INFO) << "Device name:\t" << device_name;
112   MS_LOG(INFO) << "Opencl version:\t" << device_version;
113   MS_LOG(INFO) << "Image pitch alignment:\t" << image_pitch_align_;
114   MS_LOG(INFO) << "Highest OpenCL c version:\t" << opencl_version;
115   MS_LOG(INFO) << "Max work item size:\t" << max_work_item_sizes_[0] << " : " << max_work_item_sizes_[1] << " : "
116                << max_work_item_sizes_[2];
117 
118   gpu_info_ = ParseGpuInfo(device_name, device_version);
119   // get cache size, compute units and frequency.
120   ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
121   if (ret != CL_SUCCESS) {
122     MS_LOG(WARNING) << CLErrorCode(ret);
123   }
124   ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
125   if (ret != CL_SUCCESS) {
126     MS_LOG(WARNING) << CLErrorCode(ret);
127   }
128   ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
129   if (ret != CL_SUCCESS) {
130     MS_LOG(WARNING) << CLErrorCode(ret);
131   }
132   cl_device_fp_config fp_config;
133   auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
134   support_fp16_ = CL_SUCCESS == success && fp_config > 0;
135 
136   ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
137   if (ret != CL_SUCCESS || svm_capabilities_ == 0) {
138     svm_capabilities_ = 0;
139     MS_LOG(INFO) << "SVM capalibilties: "
140                  << "NONE";
141   } else {
142     if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
143       MS_LOG(INFO) << "SVM capalibilties: "
144                    << "SVM_FINE_GRAIN_BUFFER";
145     }
146     if (svm_capabilities_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
147       MS_LOG(INFO) << "SVM capalibilties: "
148                    << "SVM_COARSE_GRAIN_BUFFER";
149     }
150     if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) {
151       MS_LOG(INFO) << "SVM capalibilties: "
152                    << "SVM_COARSE_GRAIN_SYSTEM";
153     }
154     if (svm_capabilities_ & CL_DEVICE_SVM_ATOMICS) {
155       MS_LOG(INFO) << "SVM capalibilties: "
156                    << "SVM_ATOMICS";
157     }
158   }
159   global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
160   max_alloc_size_ = device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
161   max_image2d_width_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
162   max_image2d_height_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
163   supported_extensions_ = std::string(device_->getInfo<CL_DEVICE_EXTENSIONS>());
164   cache_line_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
165   MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
166   MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
167   MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
168   MS_LOG(INFO) << "Max Alloc Size: " << max_alloc_size_;
169   MS_LOG(INFO) << "Compute Unit: " << compute_units_;
170   MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";
171   return RET_OK;
172 }
173 
InitQueue(std::vector<cl::Platform> * platforms)174 int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
175   MS_ASSERT(platforms);
176   cl_int ret = 0;
177 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
178   // create context from glcontext
179   if (this->GetGLTextureEnable()) {
180     // create context from glcontext
181     MS_LOG(INFO) << "Create special opencl context to share with OpenGL";
182 
183     if (!CheckGLContext()) {
184       MS_LOG(ERROR) << "GL Context error, please check glcontext config";
185       return RET_ERROR;
186     }
187     if (!CheckGLDisplay()) {
188       MS_LOG(ERROR) << "GL Display error, please check gldisplay config";
189       return RET_ERROR;
190     }
191 
192     cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)GetGLContext(),
193                                             CL_EGL_DISPLAY_KHR, (cl_context_properties)GetGLDisplay(), 0};
194     context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &ret);
195     if (context_ == nullptr || ret != CL_SUCCESS) {
196       MS_LOG(ERROR)
197         << "Create special OpenCL context failed, The device unspport Sharing or OpenGL Context is not Init";
198       this->enable_gl_texture_ = false;
199       if (context_ != nullptr) {
200         delete context_;
201       }
202       return RET_ERROR;
203     }
204   } else {
205     context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
206     if (context_ == nullptr) {
207       delete device_;
208       MS_LOG(ERROR) << "Create OpenCL context failed!";
209       return RET_ERROR;
210     }
211   }
212 #else
213   MS_LOG(INFO) << "Create common opencl context";
214 #ifdef Debug
215   std::vector<cl_context_properties> ctx_properties = {CL_CONTEXT_PLATFORM,
216                                                        (cl_context_properties)(*platforms)[0](),
217                                                        CL_PRINTF_CALLBACK_ARM,
218                                                        (cl_context_properties)printf_callback,
219                                                        CL_PRINTF_BUFFERSIZE_ARM,
220                                                        0x1000000,
221                                                        0};
222   context_ =
223     new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, ctx_properties.data(), nullptr, nullptr, &ret);
224   if (context_ == nullptr || ret != CL_SUCCESS) {
225     context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
226   }
227 #else
228   context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
229 #endif
230 #endif
231   if (context_ == nullptr || ret != CL_SUCCESS) {
232     delete device_;
233     device_ = nullptr;
234     MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
235     return RET_ERROR;
236   }
237 
238   default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, 0, &ret);
239   if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
240     delete device_;
241     delete context_;
242     device_ = nullptr;
243     context_ = nullptr;
244     MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
245     return RET_ERROR;
246   }
247 
248   profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, CL_QUEUE_PROFILING_ENABLE, &ret);
249   if (profiling_command_queue_ == nullptr || ret != CL_SUCCESS) {
250     delete device_;
251     delete context_;
252     delete default_command_queue_;
253     device_ = nullptr;
254     context_ = nullptr;
255     default_command_queue_ = nullptr;
256     MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
257     return RET_ERROR;
258   }
259   return RET_OK;
260 }
261 
262 // Init will get platforms info, get devices info, create opencl context.
Init()263 int OpenCLRuntime::Init() {
264   std::unique_lock<std::mutex> lck(g_init_mtx);
265   if (init_state_ == InitSuccess) {
266     return RET_OK;
267   } else if (init_state_ == InitFailed) {
268     return RET_ERROR;
269   }
270   init_state_ = InitFailed;
271 
272   MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION;
273   MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION;
274   MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION;
275 
276 #ifdef USE_OPENCL_WRAPPER
277   if (!lite::opencl::LoadOpenCLLibrary(&handle_)) {
278     MS_LOG(ERROR) << "Load OpenCL symbols failed!";
279     return RET_ERROR;
280   }
281 #endif  // USE_OPENCL_WRAPPER
282   std::vector<cl::Platform> platforms;
283   cl_int ret = cl::Platform::get(&platforms);
284   if (platforms.empty()) {
285     MS_LOG(ERROR) << "OpenCL Platform not found!" << CLErrorCode(ret);
286     return RET_ERROR;
287   }
288   auto ms_ret = InitGPUDevice(&platforms);
289   if (ms_ret != RET_OK) {
290     return ms_ret;
291   }
292 
293 #ifdef __ANDROID__
294   // only support mali device.
295   if (gpu_info_.type == MALI || gpu_info_.type == MALI_T || gpu_info_.type == MALI_G || gpu_info_.type == MALI_G78) {
296     clImportMemoryARM = reinterpret_cast<clImportMemoryARMFunc>(dlsym(handle_, "clImportMemoryARM"));
297     if (clImportMemoryARM == nullptr) {
298       MS_LOG(ERROR) << "load func (clImportMemoryARM) failed!";
299       UnLoadOpenCLLibrary(handle_);
300       return false;
301     }
302   }
303 
304   // only gltexture enable, load clCreateFromGLTexture func
305   if (this->GetGLTextureEnable()) {
306     clCreateFromGLTexture = reinterpret_cast<clCreateFromGLTextureFunc>(dlsym(handle_, "clCreateFromGLTexture"));
307     if (clCreateFromGLTexture == nullptr) {
308       MS_LOG(ERROR) << "load func (clCreateFromGLTexture) failed!";
309       UnLoadOpenCLLibrary(handle_);
310       return false;
311     }
312   }
313 #endif
314 
315   ms_ret = InitQueue(&platforms);
316   if (ms_ret != RET_OK) {
317     return ms_ret;
318   }
319 
320   allocator_ = std::make_shared<OpenCLAllocator>(this);
321   if (allocator_ == nullptr) {
322     delete device_;
323     delete context_;
324     delete default_command_queue_;
325     delete profiling_command_queue_;
326     device_ = nullptr;
327     context_ = nullptr;
328     default_command_queue_ = nullptr;
329     profiling_command_queue_ = nullptr;
330     MS_LOG(ERROR) << "Command OpenCL allocator failed!";
331     return RET_ERROR;
332   }
333   LoadCache();
334   init_state_ = InitSuccess;
335   MS_LOG(INFO) << "OpenCLRuntime init done!";
336   return RET_OK;
337 }
338 
Uninit()339 int OpenCLRuntime::Uninit() {
340   std::unique_lock<std::mutex> lck(g_init_mtx);
341   if (init_state_ != InitSuccess) {
342     return RET_OK;
343   }
344   if (StoreCache() != RET_OK) {
345     MS_LOG(ERROR) << "StoreCache failed!";
346   }
347   program_map_.clear();
348   delete default_command_queue_;
349   delete profiling_command_queue_;
350   delete context_;
351   delete device_;
352   allocator_ = nullptr;
353   default_command_queue_ = nullptr;
354   profiling_command_queue_ = nullptr;
355   context_ = nullptr;
356   device_ = nullptr;
357   init_state_ = UnInit;
358   return RET_OK;
359 }
360 
~OpenCLRuntime()361 OpenCLRuntime::~OpenCLRuntime() { Uninit(); }
362 
Context()363 cl::Context *OpenCLRuntime::Context() { return context_; }
364 
Device()365 cl::Device *OpenCLRuntime::Device() { return device_; }
366 
DeviceGlobalMemoryCacheSize() const367 uint64_t OpenCLRuntime::DeviceGlobalMemoryCacheSize() const { return global_memery_cachesize_; }
368 
DeviceMaxWorkGroupSize() const369 uint64_t OpenCLRuntime::DeviceMaxWorkGroupSize() const { return max_work_group_size_; }
370 
DeviceComputeUnits() const371 uint32_t OpenCLRuntime::DeviceComputeUnits() const { return compute_units_; }
372 
DeviceMaxFreq() const373 uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }
374 
375 // get kernel enqueue max work group size
GetMaxWorkGroupSize(const cl::Kernel & kernel)376 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
377   uint64_t max_workgroup_size = 0;
378   int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
379   if (ret != CL_SUCCESS) {
380     max_workgroup_size = 0;
381   }
382   return max_workgroup_size;
383 }
384 
385 // opencl 2.0 can get SubGroupSize.
GetSubGroupSize(const cl::Kernel & kernel,const cl::NDRange & range)386 uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
387   uint32_t sub_group_size = 0;
388 
389   if (ADRENO == gpu_info_.type) {
390 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 200 && \
391   defined(CL_TARGET_OPENCL_VERSION) && CL_TARGET_OPENCL_VERSION >= 210 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
392     cl_int cl_ret;
393     sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
394     if (cl_ret != CL_SUCCESS) {
395       CHECK_CL_SUCCESS(cl_ret)
396       sub_group_size = 0;
397     }
398 #else
399     sub_group_size = 0;
400 #endif
401   }
402 
403   return sub_group_size;
404 }
405 
GetGpuInfo()406 GpuInfo OpenCLRuntime::GetGpuInfo() { return gpu_info_; }
407 
GetFp16Enable() const408 bool OpenCLRuntime::GetFp16Enable() const { return fp16_enable_; }
409 
410 // if support fp16, set fp16 will success.
SetFp16Enable(bool enable)411 bool OpenCLRuntime::SetFp16Enable(bool enable) {
412   fp16_enable_ = enable && support_fp16_;
413   return fp16_enable_ == enable;
414 }
415 
SetGLTextureEnable(bool enable)416 bool OpenCLRuntime::SetGLTextureEnable(bool enable) {
417   enable_gl_texture_ = enable;
418   return enable_gl_texture_ == enable;
419 }
420 
GetGLTextureEnable() const421 bool OpenCLRuntime::GetGLTextureEnable() const { return enable_gl_texture_; }
422 
BuildKernel(const cl::Kernel & kernel,const std::string & program_name,const std::string & kernel_name,const std::vector<std::string> & build_options_ext,const bool is_builtin)423 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
424                                const std::string &kernel_name, const std::vector<std::string> &build_options_ext,
425                                const bool is_builtin) {
426   std::string build_option;
427   if (is_builtin) {
428     build_option = default_build_option_;
429     if (fp16_enable_) {
430       build_option +=
431         " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 "
432         "-DUINT4=ushort4"
433         " -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
434     } else {
435       build_option +=
436         " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 "
437         "-DUINT4=uint4"
438         " -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
439     }
440     build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_);
441   }
442   build_option =
443     std::accumulate(build_options_ext.begin(), build_options_ext.end(), build_option,
444                     [](const std::string &options, const std::string &option) { return options + " " + option; });
445 
446   cl::Program program;
447   auto program_key = std::make_pair(program_name, build_option);
448   auto iter = program_map_.find(program_key);
449   if (iter != program_map_.end()) {
450     program = iter->second;
451   } else {
452     flush_cache_ = true;
453     auto status = this->LoadProgram(program_name, &program);
454     if (!status) {
455       MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
456       return RET_ERROR;
457     }
458     status = this->BuildProgram(build_option, program);
459     if (!status) {
460       MS_LOG(ERROR) << program_name << " build failed!";
461       return RET_ERROR;
462     }
463     program_map_.emplace(program_key, program);
464   }
465 
466   cl_int ret;
467   const_cast<cl::Kernel &>(kernel) = cl::Kernel(program, kernel_name.c_str(), &ret);
468   if (ret != CL_SUCCESS) {
469     MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret);
470     return RET_ERROR;
471   }
472   return RET_OK;
473 }
474 
475 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
RunKernel(const cl::Kernel & kernel,const cl::NDRange & global,const cl::NDRange & local,cl::CommandQueue * command_queue,cl::Event * event)476 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
477                              cl::CommandQueue *command_queue, cl::Event *event) {
478   if (command_queue == nullptr) {
479     if (profiling_) {
480       command_queue = profiling_command_queue_;
481     } else {
482       command_queue = default_command_queue_;
483     }
484   }
485   MS_ASSERT(local.size() == 0 || local.size() == global.size());
486   cl_int ret = CL_SUCCESS;
487   ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event);
488   if (ret != CL_SUCCESS) {
489     MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
490     return RET_ERROR;
491   }
492   static int cnt = 0;
493   const int flush_period = 10;
494   if (MALI_G78 == gpu_info_.type) {
495     auto flush_ret = command_queue->flush();
496     if (flush_ret != CL_SUCCESS) {
497       MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
498     }
499   } else {
500     if (cnt % flush_period == 0) {
501       auto flush_ret = command_queue->flush();
502       if (flush_ret != CL_SUCCESS) {
503         MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
504       }
505     }
506   }
507 
508   cnt++;
509   MS_LOG(DEBUG) << "RunKernel success!";
510   if (profiling_) {
511     event->wait();
512   }
513   return RET_OK;
514 }
515 
516 // get gpu divce type
ParseGpuInfo(std::string device_name,std::string device_version)517 GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) {
518   GpuInfo info;
519   if (device_name == "QUALCOMM Adreno(TM)") {
520     info.type = ADRENO;
521   } else if (device_name.find("Mali") != std::string::npos) {
522     info.type = MALI;
523     // Mali type MALI-G or MALI_T
524     if (device_name.find("Mali-G") != std::string::npos) {
525       info.type = MALI_G;
526       if (device_name.find("Mali-G78") != std::string::npos) {
527         info.type = MALI_G78;
528       }
529     } else if (device_name.find("Mali-T") != std::string::npos) {
530       info.type = MALI_T;
531     }
532   }
533   return info;
534 }
535 
LoadSource(const std::string & program_name,const std::string & source)536 bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
537   auto it_source = g_source_map.find(program_name);
538   if (it_source == g_source_map.end()) {
539     g_source_map.emplace(program_name, source);
540   }
541   return true;
542 }
543 
544 // load program with program name.
LoadProgram(const std::string & program_name,cl::Program * program)545 bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *program) {
546   auto it_source = g_source_map.find(program_name);
547   if (it_source != g_source_map.end()) {
548     cl::Program::Sources sources;
549     sources.push_back(it_source->second);
550     *program = cl::Program(*context_, sources);
551     return true;
552   } else {
553     MS_LOG(ERROR) << "Can't find kernel source !";
554     return false;
555   }
556 }
557 
558 // build program with build options
BuildProgram(const std::string & build_option,const cl::Program & program)559 bool OpenCLRuntime::BuildProgram(const std::string &build_option, const cl::Program &program) {
560   cl_int ret = program.build({*device_}, build_option.c_str());
561   if (ret != CL_SUCCESS) {
562     if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
563       std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
564       MS_LOG(ERROR) << "Program build log: " << build_log;
565     }
566     MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret);
567     return false;
568   }
569   return true;
570 }
571 
ReadOrWriteImage(void * buffer,void * data,bool is_read)572 int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) {
573   cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_;
574   auto *image = allocator_->GetImage(buffer);
575   if (image == nullptr) {
576     MS_LOG(WARNING) << "Can't get Image2D for " << buffer;
577     return RET_ERROR;
578   }
579   ImageSize img_size;
580   int ret = allocator_->GetImageSize(buffer, &img_size);
581   if (ret != RET_OK) {
582     MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer;
583     return RET_ERROR;
584   }
585   cl::array<size_t, 3> origin = {0, 0, 0};
586   cl::array<size_t, 3> region = {img_size.width, img_size.height, 1};
587   if (is_read) {
588     ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
589   } else {
590     ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
591   }
592   if (ret != CL_SUCCESS) {
593     MS_LOG(ERROR) << CLErrorCode(ret);
594     return RET_ERROR;
595   }
596   return RET_OK;
597 }
598 
ReadImage(void * buffer,void * dst_data)599 int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); }
600 
WriteImage(void * buffer,void * src_data)601 int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); }
602 
CopyDeviceMemToHost(void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const603 bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
604                                         bool sync) const {
605   if (command_queue == nullptr) {
606     command_queue = default_command_queue_;
607   }
608   cl_int cl_ret = CL_SUCCESS;
609   const cl::Buffer *buffer = static_cast<const cl::Buffer *>(src);
610   if (command_queue != nullptr) {
611     cl_ret = command_queue->enqueueReadBuffer(*buffer, sync, 0, size, dst);
612   }
613   return cl_ret == CL_SUCCESS;
614 }
615 
CopyHostMemToDevice(const void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const616 bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
617                                         bool sync) const {
618   if (command_queue == nullptr) {
619     command_queue = default_command_queue_;
620   }
621   cl_int cl_ret = CL_SUCCESS;
622   const cl::Buffer *buffer = static_cast<const cl::Buffer *>(dst);
623   if (command_queue != nullptr) {
624     cl_ret = command_queue->enqueueWriteBuffer(*buffer, sync, 0, size, src);
625   }
626   return cl_ret == CL_SUCCESS;
627 }
628 
MapBuffer(const cl::Buffer & buffer,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const629 void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue,
630                                bool sync) const {
631   if (command_queue == nullptr) {
632     command_queue = default_command_queue_;
633   }
634   return command_queue->enqueueMapBuffer(buffer, sync, flags, 0, size);
635 }
636 
MapBuffer(void * host_ptr,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const637 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
638   if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
639     return RET_ERROR;
640   }
641   if (command_queue == nullptr) {
642     command_queue = default_command_queue_;
643   }
644   if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
645     return RET_ERROR;
646   }
647   return RET_OK;
648 }
649 
MapBuffer(const cl::Image2D & buffer,bool sync,int flags,const std::vector<size_t> & region,cl::CommandQueue * command_queue) const650 void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
651                                cl::CommandQueue *command_queue) const {
652   if (command_queue == nullptr) {
653     command_queue = default_command_queue_;
654   }
655   cl::size_type row_pitch;
656   cl::size_type slice_pitch;
657   cl::array<cl::size_type, 3> origin_{0, 0, 0};
658   cl::array<cl::size_type, 3> region_{region[0], region[1], region[2]};
659   return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch);
660 }
661 
UnmapBuffer(const cl::Memory & buffer,void * host_ptr,cl::CommandQueue * command_queue) const662 int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
663   if (command_queue == nullptr) {
664     command_queue = default_command_queue_;
665   }
666   return command_queue->enqueueUnmapMemObject(buffer, host_ptr);
667 }
668 
UnmapBuffer(void * host_ptr,cl::CommandQueue * command_queue) const669 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
670   if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
671     return RET_OK;
672   }
673   if (command_queue == nullptr) {
674     command_queue = default_command_queue_;
675   }
676   return clEnqueueSVMUnmap(command_queue->get(), host_ptr, 0, nullptr, nullptr);
677 }
678 
SyncCommandQueue(cl::CommandQueue * command_queue)679 bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {
680   if (command_queue == nullptr) {
681     command_queue = default_command_queue_;
682   }
683   cl_int ret = command_queue->finish();
684   if (ret != CL_SUCCESS) {
685     MS_LOG(ERROR) << "Command queue sync failed: " << CLErrorCode(ret);
686     return RET_ERROR;
687   }
688   return ret == CL_SUCCESS;
689 }
690 
GetKernelMaxWorkGroupSize(cl_kernel kernel,cl_device_id device_id)691 int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) {
692   size_t max_work_group_size;
693   cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
694                                         &max_work_group_size, nullptr);
695   if (ret != CL_SUCCESS) {
696     MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret);
697   }
698   return static_cast<int>(max_work_group_size);
699 }
700 
GetKernelFromBinary(const std::string & kernel_name)701 cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) {
702   cl_int ret = CL_SUCCESS;
703   cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret);
704   if (ret != CL_SUCCESS) {
705     MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret);
706   }
707   return kernel;
708 }
709 
710 // build program with IL
CreateProgramFromIL(const std::vector<char> & binary,const std::string & flag)711 cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag) {
712 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 210
713   cl::Program program = cl::Program(*context_, binary);
714   bool status = BuildProgram(default_build_opts_, program);
715   if (!status) {
716     MS_LOG(ERROR) << "Build program with IL failed!";
717   }
718   return program;
719 #else
720   MS_LOG(ERROR) << "Create program with IL failed! The compute capabitity of device should be 2.1 and higher.";
721   return cl::Program();
722 #endif
723 }
724 
725 // build program with binary
CreateProgramFromBinary(const std::vector<unsigned char> & binary,const std::string & build_option)726 cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary,
727                                                    const std::string &build_option) {
728   cl::Program program = cl::Program(*context_, {*device_}, {binary});
729   bool status = BuildProgram(build_option, program);
730   if (!status) {
731     MS_LOG(ERROR) << "Build program with binary failed!";
732   }
733   return program;
734 }
735 
GetProgramBinary(const cl::Program & program)736 std::vector<unsigned char> OpenCLRuntime::GetProgramBinary(const cl::Program &program) {
737   cl_int ret = CL_SUCCESS;
738   auto binarys = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
739   if (ret != CL_SUCCESS) {
740     MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
741   }
742   if (binarys.empty()) {
743     MS_LOG(ERROR) << "binarys is empty";
744     return {};
745   }
746   return binarys.front();
747 }
748 
LoadCache()749 void OpenCLRuntime::LoadCache() {
750   if (!enable_cache_) {
751     return;
752   }
753   size_t len;
754   std::unique_ptr<char[]> buf(lite::ReadFile(cache_path_.c_str(), &len));
755   if (buf == nullptr) {
756     MS_LOG(ERROR) << "Load opencl cache fail: buf == nullptr";
757     return;
758   }
759   auto gpu_cache = schema::GetGpuCache(buf.get());
760   if (gpu_cache == nullptr) {
761     MS_LOG(ERROR) << "Load opencl cache fail: gpu_cache == nullptr";
762     return;
763   }
764   auto *bins = gpu_cache->allBins();
765   if (bins == nullptr) {
766     MS_LOG(ERROR) << "Load opencl cache fail: bins == nullptr";
767     return;
768   }
769   for (size_t i = 0; i < bins->size(); ++i) {
770     auto *bin = bins->template GetAs<schema::ProgramBinary>(i);
771     if (bin == nullptr) {
772       MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
773       return;
774     }
775     auto *pdata = bin->data();
776     MS_ASSERT(pdata);
777     if (pdata->size() == 0) {
778       continue;
779     }
780     std::vector<unsigned char> binary(pdata->begin(), pdata->end());
781     auto program = CreateProgramFromBinary(binary, bin->build_option()->str());
782     program_map_.emplace(std::make_pair(bin->program_name()->str(), bin->build_option()->str()), program);
783     MS_LOG(INFO) << "LoadCache " << bin->program_name() << " success, size=" << binary.size();
784   }
785   MS_LOG(INFO) << "Init opencl cache success";
786 }
787 
StoreCache()788 int OpenCLRuntime::StoreCache() {
789   if (!enable_cache_) {
790     return RET_OK;
791   }
792   if (!flush_cache_) {
793     return RET_OK;
794   }
795   auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
796   if (fbb == nullptr) {
797     MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
798     return RET_ERROR;
799   }
800   std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
801   for (const auto &kv : program_map_) {
802     auto program_name = kv.first.first;
803     auto build_option = kv.first.second;
804     cl::Program program = kv.second;
805     auto binary = this->GetProgramBinary(program);
806     std::vector<int32_t> shape;
807     auto tune = schema::CreateTuneParam(*fbb, fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape),
808                                         fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape));
809     auto program_binary = schema::CreateProgramBinary(
810       *fbb, fbb->CreateString(program_name), fbb->CreateString(build_option), tune, fbb->CreateVector<uint8_t>(binary));
811     program_binarys.emplace_back(program_binary);
812     MS_LOG(INFO) << "StoreCache " << program_name << " success, size=" << binary.size();
813   }
814 
815   auto data = fbb->CreateVector<flatbuffers::Offset<schema::ProgramBinary>>(program_binarys);
816   auto name = fbb->CreateString("OpenCLCache");
817   auto version = fbb->CreateString(cache_version_);
818   auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
819   fbb->Finish(gpu_cache);
820   uint8_t *buf = fbb->GetBufferPointer();
821   if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
822     MS_LOG(ERROR) << "WriteToBin failed.";
823     return RET_ERROR;
824   }
825   MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
826   return RET_OK;
827 }
828 
CreateSharedMemoryBuffer(size_t size,void * host_ptr)829 cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
830   cl_int error = CL_SUCCESS;
831   cl_mem cl_buffer = clImportMemoryARM(context_->get(), CL_MEM_READ_WRITE, NULL, host_ptr, size, &error);
832   if (error != CL_SUCCESS) {
833     MS_LOG(ERROR) << "Create OpenCL shared memory failed for" << CLErrorCode(error);
834     return nullptr;
835   }
836   cl::Buffer *buffer = new (std::nothrow) cl::Buffer(cl_buffer, false);
837   if (buffer == nullptr) {
838     MS_LOG(ERROR) << "New OpenCL Buffer failed";
839     return nullptr;
840   }
841   return buffer;
842 }
843 }  // namespace mindspore::lite::opencl
844