• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/runtime/gpu/opencl/opencl_runtime.h"
18 #include <dlfcn.h>
19 #ifdef SHARING_MEM_WITH_OPENGL
20 #include <EGL/egl.h>
21 #endif
22 #include <vector>
23 #include <numeric>
24 #include <utility>
25 #include "include/errorcode.h"
26 #include "src/runtime/kernel/opencl/utils.h"
27 #include "src/runtime/gpu/opencl/opencl_allocator.h"
28 #include "src/common/file_utils.h"
29 
30 using mindspore::kernel::CLErrorCode;
31 
32 namespace mindspore::lite::opencl {
33 static std::map<std::string, std::string> g_source_map;
34 static std::mutex g_mtx;
35 static std::mutex g_init_mtx;
36 
37 InitState OpenCLRuntime::init_state_ = UnInit;
38 OpenCLRuntime *OpenCLRuntime::ocl_runtime_instance_ = nullptr;
39 size_t OpenCLRuntime::instance_count_ = 0;
40 
GetInstance()41 OpenCLRuntime *OpenCLRuntime::GetInstance() {
42   std::unique_lock<std::mutex> lck(g_mtx);
43   static OpenCLRuntime ocl_runtime;
44   if (instance_count_ == 0) {
45     ocl_runtime_instance_ = &ocl_runtime;
46     ocl_runtime_instance_->Init();
47   }
48   instance_count_++;
49   return ocl_runtime_instance_;
50 }
51 
DeleteInstance()52 void OpenCLRuntime::DeleteInstance() {
53   std::unique_lock<std::mutex> lck(g_mtx);
54   if (instance_count_ == 0) {
55     MS_LOG(ERROR) << "No OpenCLRuntime instance could delete!";
56     return;
57   }
58   instance_count_--;
59   if (instance_count_ == 0) {
60     ocl_runtime_instance_->Uninit();
61   }
62 }
63 
printf_callback(const char * buffer,size_t length,size_t final,void * user_data)64 void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) {
65   fwrite(buffer, 1, length, stdout);
66 }
67 
InitGPUDevice(std::vector<cl::Platform> * platforms)68 int OpenCLRuntime::InitGPUDevice(std::vector<cl::Platform> *platforms) {
69   MS_ASSERT(platforms);
70   // search GPU
71   std::vector<cl::Device> devices;
72   int ret = RET_OK;
73   for (auto &platform : *platforms) {
74     std::string platform_name;
75     ret = platform.getInfo(CL_PLATFORM_NAME, &platform_name);
76     if (ret != CL_SUCCESS) {
77       MS_LOG(WARNING) << CLErrorCode(ret);
78     }
79     ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
80     if (ret != CL_SUCCESS) {
81       MS_LOG(WARNING) << CLErrorCode(ret);
82     }
83     MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";
84 
85     if (!devices.empty()) {
86       std::string device_name = devices[0].getInfo<CL_DEVICE_NAME>();
87       MS_LOG(INFO) << "Find GPU: " << device_name.c_str();
88       cl::Platform::setDefault(platform);
89       break;
90     }
91   }
92 
93   // not found, return error code.
94   if (devices.empty()) {
95     MS_LOG(ERROR) << "OpenCL Device not found!";
96     return RET_ERROR;
97   }
98 
99   device_ = new (std::nothrow) cl::Device();
100   if (device_ == nullptr) {
101     MS_LOG(ERROR) << "Create OpenCL device failed!";
102     return RET_ERROR;
103   }
104   *device_ = devices[0];
105   max_work_item_sizes_ = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
106   max_work_group_size_ = max_work_item_sizes_[0];
107   const std::string device_name = device_->getInfo<CL_DEVICE_NAME>();
108   const std::string device_version = device_->getInfo<CL_DEVICE_VERSION>();
109   const std::string opencl_version = device_->getInfo<CL_DEVICE_OPENCL_C_VERSION>();
110   clGetDeviceInfo((*device_)(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &image_pitch_align_, nullptr);
111   MS_LOG(INFO) << "Device name:\t" << device_name;
112   MS_LOG(INFO) << "Opencl version:\t" << device_version;
113   MS_LOG(INFO) << "Image pitch alignment:\t" << image_pitch_align_;
114   MS_LOG(INFO) << "Highest OpenCL c version:\t" << opencl_version;
115   MS_LOG(INFO) << "Max work item size:\t" << max_work_item_sizes_[0] << " : " << max_work_item_sizes_[1] << " : "
116                << max_work_item_sizes_[2];
117 
118   gpu_info_ = ParseGpuInfo(device_name, device_version);
119   // get cache size, compute units and frequency.
120   ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
121   if (ret != CL_SUCCESS) {
122     MS_LOG(WARNING) << CLErrorCode(ret);
123   }
124   ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
125   if (ret != CL_SUCCESS) {
126     MS_LOG(WARNING) << CLErrorCode(ret);
127   }
128   ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
129   if (ret != CL_SUCCESS) {
130     MS_LOG(WARNING) << CLErrorCode(ret);
131   }
132   cl_device_fp_config fp_config;
133   auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
134   support_fp16_ = CL_SUCCESS == success && fp_config > 0;
135 
136   ret = device_->getInfo(CL_DEVICE_SVM_CAPABILITIES, &svm_capabilities_);
137   if (ret != CL_SUCCESS || svm_capabilities_ == 0) {
138     svm_capabilities_ = 0;
139     MS_LOG(INFO) << "SVM capalibilties: "
140                  << "NONE";
141   } else {
142     if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
143       MS_LOG(INFO) << "SVM capalibilties: "
144                    << "SVM_FINE_GRAIN_BUFFER";
145     }
146     if (svm_capabilities_ & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
147       MS_LOG(INFO) << "SVM capalibilties: "
148                    << "SVM_COARSE_GRAIN_BUFFER";
149     }
150     if (svm_capabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) {
151       MS_LOG(INFO) << "SVM capalibilties: "
152                    << "SVM_COARSE_GRAIN_SYSTEM";
153     }
154     if (svm_capabilities_ & CL_DEVICE_SVM_ATOMICS) {
155       MS_LOG(INFO) << "SVM capalibilties: "
156                    << "SVM_ATOMICS";
157     }
158   }
159   global_memery_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
160   max_alloc_size_ = device_->getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
161   max_image2d_width_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
162   max_image2d_height_ = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
163   supported_extensions_ = std::string(device_->getInfo<CL_DEVICE_EXTENSIONS>());
164   cache_line_size_ = device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
165   MS_LOG(INFO) << "Address space bits: " << device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
166   MS_LOG(INFO) << "Global Mem Size: " << global_memery_size_;
167   MS_LOG(INFO) << "Global Mem Cache Size: " << global_memery_cachesize_;
168   MS_LOG(INFO) << "Max Alloc Size: " << max_alloc_size_;
169   MS_LOG(INFO) << "Compute Unit: " << compute_units_;
170   MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz";
171   return RET_OK;
172 }
173 
InitQueue(std::vector<cl::Platform> * platforms)174 int OpenCLRuntime::InitQueue(std::vector<cl::Platform> *platforms) {
175   MS_ASSERT(platforms);
176   cl_int ret;
177 #if defined(SHARING_MEM_WITH_OPENGL) && defined(CL_HPP_TARGET_OPENCL_VERSION) && (CL_HPP_TARGET_OPENCL_VERSION >= 120)
178   // create context from glcontext
179   MS_LOG(INFO) << "Create special opencl context to share with OpenGL";
180   cl_context_properties context_prop[] = {CL_GL_CONTEXT_KHR, (cl_context_properties)eglGetCurrentContext(),
181                                           CL_EGL_DISPLAY_KHR, (cl_context_properties)eglGetCurrentDisplay(), 0};
182   context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, context_prop, nullptr, nullptr, &ret);
183   if (context_ == nullptr || ret != CL_SUCCESS) {
184     MS_LOG(ERROR) << "Create special OpenCL context failed, Create common OpenCL context then.";
185     if (context_ != nullptr) {
186       delete context_;
187     }
188     context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
189     if (context_ == nullptr) {
190       delete device_;
191       MS_LOG(ERROR) << "Create OpenCL context failed!";
192       return RET_ERROR;
193     }
194   }
195 #else
196   MS_LOG(INFO) << "Create common opencl context";
197 #ifdef Debug
198   std::vector<cl_context_properties> ctx_properties = {CL_CONTEXT_PLATFORM,
199                                                        (cl_context_properties)(*platforms)[0](),
200                                                        CL_PRINTF_CALLBACK_ARM,
201                                                        (cl_context_properties)printf_callback,
202                                                        CL_PRINTF_BUFFERSIZE_ARM,
203                                                        0x1000000,
204                                                        0};
205   context_ =
206     new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, ctx_properties.data(), nullptr, nullptr, &ret);
207   if (context_ == nullptr || ret != CL_SUCCESS) {
208     context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
209   }
210 #else
211   context_ = new (std::nothrow) cl::Context(std::vector<cl::Device>{*device_}, nullptr, nullptr, nullptr, &ret);
212 #endif
213 #endif
214   if (context_ == nullptr || ret != CL_SUCCESS) {
215     delete device_;
216     device_ = nullptr;
217     MS_LOG(ERROR) << "Context create failed: " << CLErrorCode(ret);
218     return RET_ERROR;
219   }
220 
221   default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, 0, &ret);
222   if (default_command_queue_ == nullptr || ret != CL_SUCCESS) {
223     delete device_;
224     delete context_;
225     device_ = nullptr;
226     context_ = nullptr;
227     MS_LOG(ERROR) << "Command Queue create failed: " << CLErrorCode(ret);
228     return RET_ERROR;
229   }
230 
231   profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, CL_QUEUE_PROFILING_ENABLE, &ret);
232   if (profiling_command_queue_ == nullptr || ret != CL_SUCCESS) {
233     delete device_;
234     delete context_;
235     delete default_command_queue_;
236     device_ = nullptr;
237     context_ = nullptr;
238     default_command_queue_ = nullptr;
239     MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret);
240     return RET_ERROR;
241   }
242   return RET_OK;
243 }
244 
245 // Init will get platforms info, get devices info, create opencl context.
Init()246 int OpenCLRuntime::Init() {
247   std::unique_lock<std::mutex> lck(g_init_mtx);
248   if (init_state_ == InitSuccess) {
249     return RET_OK;
250   } else if (init_state_ == InitFailed) {
251     return RET_ERROR;
252   }
253   init_state_ = InitFailed;
254 
255   MS_LOG(INFO) << "OpenCL version: CL_TARGET_OPENCL_VERSION " << CL_TARGET_OPENCL_VERSION;
256   MS_LOG(INFO) << "CL_HPP_TARGET_OPENCL_VERSION " << CL_HPP_TARGET_OPENCL_VERSION;
257   MS_LOG(INFO) << "CL_HPP_MINIMUM_OPENCL_VERSION " << CL_HPP_MINIMUM_OPENCL_VERSION;
258 
259 #ifdef USE_OPENCL_WRAPPER
260   if (!lite::opencl::LoadOpenCLLibrary(&handle_)) {
261     MS_LOG(ERROR) << "Load OpenCL symbols failed!";
262     return RET_ERROR;
263   }
264 #endif  // USE_OPENCL_WRAPPER
265   std::vector<cl::Platform> platforms;
266   cl_int ret = cl::Platform::get(&platforms);
267   if (platforms.empty()) {
268     MS_LOG(ERROR) << "OpenCL Platform not found!" << CLErrorCode(ret);
269     return RET_ERROR;
270   }
271   auto ms_ret = InitGPUDevice(&platforms);
272   if (ms_ret != RET_OK) {
273     return ms_ret;
274   }
275 
276   // only support mali device.
277   if (gpu_info_.type == MALI || gpu_info_.type == MALI_T || gpu_info_.type == MALI_G) {
278     clImportMemoryARM = reinterpret_cast<clImportMemoryARMFunc>(dlsym(handle_, "clImportMemoryARM"));
279     if (clImportMemoryARM == nullptr) {
280       MS_LOG(ERROR) << "load func (clImportMemoryARM) failed!";
281       UnLoadOpenCLLibrary(handle_);
282       return false;
283     }
284   }
285 
286   ms_ret = InitQueue(&platforms);
287   if (ms_ret != RET_OK) {
288     return ms_ret;
289   }
290 
291   allocator_ = std::make_shared<OpenCLAllocator>(this);
292   if (allocator_ == nullptr) {
293     delete device_;
294     delete context_;
295     delete default_command_queue_;
296     delete profiling_command_queue_;
297     device_ = nullptr;
298     context_ = nullptr;
299     default_command_queue_ = nullptr;
300     profiling_command_queue_ = nullptr;
301     MS_LOG(ERROR) << "Command OpenCL allocator failed!";
302     return RET_ERROR;
303   }
304   LoadCache();
305   init_state_ = InitSuccess;
306   MS_LOG(INFO) << "OpenCLRuntime init done!";
307   return RET_OK;
308 }
309 
Uninit()310 int OpenCLRuntime::Uninit() {
311   std::unique_lock<std::mutex> lck(g_init_mtx);
312   if (init_state_ != InitSuccess) {
313     return RET_OK;
314   }
315   if (StoreCache() != RET_OK) {
316     MS_LOG(ERROR) << "StoreCache failed!";
317   }
318   program_map_.clear();
319   delete default_command_queue_;
320   delete profiling_command_queue_;
321   delete context_;
322   delete device_;
323   allocator_ = nullptr;
324   default_command_queue_ = nullptr;
325   profiling_command_queue_ = nullptr;
326   context_ = nullptr;
327   device_ = nullptr;
328   init_state_ = UnInit;
329   return RET_OK;
330 }
331 
~OpenCLRuntime()332 OpenCLRuntime::~OpenCLRuntime() { Uninit(); }
333 
Context()334 cl::Context *OpenCLRuntime::Context() { return context_; }
335 
Device()336 cl::Device *OpenCLRuntime::Device() { return device_; }
337 
DeviceGlobalMemoryCacheSize() const338 uint64_t OpenCLRuntime::DeviceGlobalMemoryCacheSize() const { return global_memery_cachesize_; }
339 
DeviceMaxWorkGroupSize() const340 uint64_t OpenCLRuntime::DeviceMaxWorkGroupSize() const { return max_work_group_size_; }
341 
DeviceComputeUnits() const342 uint32_t OpenCLRuntime::DeviceComputeUnits() const { return compute_units_; }
343 
DeviceMaxFreq() const344 uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }
345 
346 // get kernel enqueue max work group size
GetMaxWorkGroupSize(const cl::Kernel & kernel)347 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
348   uint64_t max_workgroup_size = 0;
349   int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
350   if (ret != CL_SUCCESS) {
351     max_workgroup_size = 0;
352   }
353   return max_workgroup_size;
354 }
355 
356 // opencl 2.0 can get SubGroupSize.
GetSubGroupSize(const cl::Kernel & kernel,const cl::NDRange & range)357 uint32_t OpenCLRuntime::GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range) {
358   uint32_t sub_group_size = 0;
359 
360   if (ADRENO == gpu_info_.type) {
361 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 200 && \
362   defined(CL_TARGET_OPENCL_VERSION) && CL_TARGET_OPENCL_VERSION >= 210 && defined(CL_HPP_USE_CL_SUB_GROUPS_KHR)
363     cl_int cl_ret;
364     sub_group_size = kernel.getSubGroupInfo<CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE>(*device_, range, &cl_ret);
365     if (cl_ret != CL_SUCCESS) {
366       CHECK_CL_SUCCESS(cl_ret)
367       sub_group_size = 0;
368     }
369 #else
370     sub_group_size = 0;
371 #endif
372   }
373 
374   return sub_group_size;
375 }
376 
GetGpuInfo()377 GpuInfo OpenCLRuntime::GetGpuInfo() { return gpu_info_; }
378 
GetFp16Enable() const379 bool OpenCLRuntime::GetFp16Enable() const { return fp16_enable_; }
380 
381 // if support fp16, set fp16 will success.
SetFp16Enable(bool enable)382 bool OpenCLRuntime::SetFp16Enable(bool enable) {
383   fp16_enable_ = enable && support_fp16_;
384   return fp16_enable_ == enable;
385 }
386 
BuildKernel(const cl::Kernel & kernel,const std::string & program_name,const std::string & kernel_name,const std::vector<std::string> & build_options_ext,const bool is_builtin)387 int OpenCLRuntime::BuildKernel(const cl::Kernel &kernel, const std::string &program_name,
388                                const std::string &kernel_name, const std::vector<std::string> &build_options_ext,
389                                const bool is_builtin) {
390   std::string build_option;
391   if (is_builtin) {
392     build_option = default_build_option_;
393     if (fp16_enable_) {
394       build_option +=
395         " -DFP16_ENABLE=1 -DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 -DAS_UINT4=as_ushort4 "
396         "-DUINT4=ushort4"
397         " -DTO_FLT=convert_half -DTO_FLT4=convert_half4";
398     } else {
399       build_option +=
400         " -DFP16_ENABLE=0 -DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 -DAS_UINT4=as_uint4 "
401         "-DUINT4=uint4"
402         " -DTO_FLT=convert_float -DTO_FLT4=convert_float4";
403     }
404     build_option += " -DMAX_IMAGE2D_WIDTH=" + std::to_string(max_image2d_width_);
405   }
406   build_option =
407     std::accumulate(build_options_ext.begin(), build_options_ext.end(), build_option,
408                     [](const std::string &options, const std::string &option) { return options + " " + option; });
409 
410   cl::Program program;
411   auto program_key = std::make_pair(program_name, build_option);
412   auto iter = program_map_.find(program_key);
413   if (iter != program_map_.end()) {
414     program = iter->second;
415   } else {
416     flush_cache_ = true;
417     auto status = this->LoadProgram(program_name, &program);
418     if (!status) {
419       MS_LOG(ERROR) << "load program (" << program_name << ") failed!";
420       return RET_ERROR;
421     }
422     status = this->BuildProgram(build_option, program);
423     if (!status) {
424       MS_LOG(ERROR) << program_name << " build failed!";
425       return RET_ERROR;
426     }
427     program_map_.emplace(program_key, program);
428   }
429 
430   cl_int ret;
431   const_cast<cl::Kernel &>(kernel) = cl::Kernel(program, kernel_name.c_str(), &ret);
432   if (ret != CL_SUCCESS) {
433     MS_LOG(ERROR) << kernel_name << " Kernel create failed:" << CLErrorCode(ret);
434     return RET_ERROR;
435   }
436   return RET_OK;
437 }
438 
439 // Run Kernel with 1D, 2D, 3D group size, and local size can be empty.
RunKernel(const cl::Kernel & kernel,const cl::NDRange & global,const cl::NDRange & local,cl::CommandQueue * command_queue,cl::Event * event)440 int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
441                              cl::CommandQueue *command_queue, cl::Event *event) {
442   if (command_queue == nullptr) {
443     if (profiling_) {
444       command_queue = profiling_command_queue_;
445     } else {
446       command_queue = default_command_queue_;
447     }
448   }
449   MS_ASSERT(local.size() == 0 || local.size() == global.size());
450   cl_int ret = CL_SUCCESS;
451   ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event);
452   if (ret != CL_SUCCESS) {
453     MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret);
454     return RET_ERROR;
455   }
456   static int cnt = 0;
457   const int flush_period = 10;
458   if (cnt % flush_period == 0) {
459     auto flush_ret = command_queue->flush();
460     if (flush_ret != CL_SUCCESS) {
461       MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
462     }
463   }
464   cnt++;
465   MS_LOG(DEBUG) << "RunKernel success!";
466   if (profiling_) {
467     event->wait();
468   }
469   return RET_OK;
470 }
471 
472 // get gpu divce type
ParseGpuInfo(std::string device_name,std::string device_version)473 GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) {
474   GpuInfo info;
475   if (device_name == "QUALCOMM Adreno(TM)") {
476     info.type = ADRENO;
477   } else if (device_name.find("Mali") != std::string::npos) {
478     info.type = MALI;
479     // Mali type MALI-G or MALI_T
480     if (device_name.find("Mali-G") != std::string::npos) {
481       info.type = MALI_G;
482     } else if (device_name.find("Mali-T") != std::string::npos) {
483       info.type = MALI_T;
484     }
485   }
486   return info;
487 }
488 
LoadSource(const std::string & program_name,const std::string & source)489 bool OpenCLRuntime::LoadSource(const std::string &program_name, const std::string &source) {
490   auto it_source = g_source_map.find(program_name);
491   if (it_source == g_source_map.end()) {
492     g_source_map.emplace(program_name, source);
493   }
494   return true;
495 }
496 
497 // load program with program name.
LoadProgram(const std::string & program_name,cl::Program * program)498 bool OpenCLRuntime::LoadProgram(const std::string &program_name, cl::Program *program) {
499   auto it_source = g_source_map.find(program_name);
500   if (it_source != g_source_map.end()) {
501     cl::Program::Sources sources;
502     sources.push_back(it_source->second);
503     *program = cl::Program(*context_, sources);
504     return true;
505   } else {
506     MS_LOG(ERROR) << "Can't find kernel source !";
507     return false;
508   }
509 }
510 
511 // build program with build options
BuildProgram(const std::string & build_option,const cl::Program & program)512 bool OpenCLRuntime::BuildProgram(const std::string &build_option, const cl::Program &program) {
513   cl_int ret = program.build({*device_}, build_option.c_str());
514   if (ret != CL_SUCCESS) {
515     if (program.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(*device_) == CL_BUILD_ERROR) {
516       std::string build_log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(*device_);
517       MS_LOG(ERROR) << "Program build log: " << build_log;
518     }
519     MS_LOG(ERROR) << "Build program failed: " << CLErrorCode(ret);
520     return false;
521   }
522   return true;
523 }
524 
ReadOrWriteImage(void * buffer,void * data,bool is_read)525 int OpenCLRuntime::ReadOrWriteImage(void *buffer, void *data, bool is_read) {
526   cl::CommandQueue *command_queue = profiling_ ? profiling_command_queue_ : default_command_queue_;
527   auto *image = allocator_->GetImage(buffer);
528   if (image == nullptr) {
529     MS_LOG(WARNING) << "Can't get Image2D for " << buffer;
530     return RET_ERROR;
531   }
532   ImageSize img_size;
533   int ret = allocator_->GetImageSize(buffer, &img_size);
534   if (ret != RET_OK) {
535     MS_LOG(WARNING) << "Can't get GetImageSize for " << buffer;
536     return RET_ERROR;
537   }
538   cl::array<size_t, 3> origin = {0, 0, 0};
539   cl::array<size_t, 3> region = {img_size.width, img_size.height, 1};
540   if (is_read) {
541     ret = command_queue->enqueueReadImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
542   } else {
543     ret = command_queue->enqueueWriteImage(*image, true, origin, region, 0, 0, data, nullptr, nullptr);
544   }
545   if (ret != CL_SUCCESS) {
546     MS_LOG(ERROR) << CLErrorCode(ret);
547     return RET_ERROR;
548   }
549   return RET_OK;
550 }
551 
ReadImage(void * buffer,void * dst_data)552 int OpenCLRuntime::ReadImage(void *buffer, void *dst_data) { return ReadOrWriteImage(buffer, dst_data, true); }
553 
WriteImage(void * buffer,void * src_data)554 int OpenCLRuntime::WriteImage(void *buffer, void *src_data) { return ReadOrWriteImage(buffer, src_data, false); }
555 
CopyDeviceMemToHost(void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const556 bool OpenCLRuntime::CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
557                                         bool sync) const {
558   if (command_queue == nullptr) {
559     command_queue = default_command_queue_;
560   }
561   cl_int cl_ret = CL_SUCCESS;
562   const cl::Buffer *buffer = static_cast<const cl::Buffer *>(src);
563   if (command_queue != nullptr) {
564     cl_ret = command_queue->enqueueReadBuffer(*buffer, sync, 0, size, dst);
565   }
566   return cl_ret == CL_SUCCESS;
567 }
568 
CopyHostMemToDevice(const void * dst,const void * src,size_t size,cl::CommandQueue * command_queue,bool sync) const569 bool OpenCLRuntime::CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue,
570                                         bool sync) const {
571   if (command_queue == nullptr) {
572     command_queue = default_command_queue_;
573   }
574   cl_int cl_ret = CL_SUCCESS;
575   const cl::Buffer *buffer = static_cast<const cl::Buffer *>(dst);
576   if (command_queue != nullptr) {
577     cl_ret = command_queue->enqueueWriteBuffer(*buffer, sync, 0, size, src);
578   }
579   return cl_ret == CL_SUCCESS;
580 }
581 
MapBuffer(const cl::Buffer & buffer,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const582 void *OpenCLRuntime::MapBuffer(const cl::Buffer &buffer, int flags, size_t size, cl::CommandQueue *command_queue,
583                                bool sync) const {
584   if (command_queue == nullptr) {
585     command_queue = default_command_queue_;
586   }
587   return command_queue->enqueueMapBuffer(buffer, sync, flags, 0, size);
588 }
589 
MapBuffer(void * host_ptr,int flags,size_t size,cl::CommandQueue * command_queue,bool sync) const590 int OpenCLRuntime::MapBuffer(void *host_ptr, int flags, size_t size, cl::CommandQueue *command_queue, bool sync) const {
591   if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
592     return RET_ERROR;
593   }
594   if (command_queue == nullptr) {
595     command_queue = default_command_queue_;
596   }
597   if (clEnqueueSVMMap(command_queue->get(), sync, flags, host_ptr, size, 0, nullptr, nullptr) != CL_SUCCESS) {
598     return RET_ERROR;
599   }
600   return RET_OK;
601 }
602 
MapBuffer(const cl::Image2D & buffer,bool sync,int flags,const std::vector<size_t> & region,cl::CommandQueue * command_queue) const603 void *OpenCLRuntime::MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
604                                cl::CommandQueue *command_queue) const {
605   if (command_queue == nullptr) {
606     command_queue = default_command_queue_;
607   }
608   cl::size_type row_pitch;
609   cl::size_type slice_pitch;
610   cl::array<cl::size_type, 3> origin_{0, 0, 0};
611   cl::array<cl::size_type, 3> region_{region[0], region[1], region[2]};
612   return command_queue->enqueueMapImage(buffer, sync, flags, origin_, region_, &row_pitch, &slice_pitch);
613 }
614 
UnmapBuffer(const cl::Memory & buffer,void * host_ptr,cl::CommandQueue * command_queue) const615 int OpenCLRuntime::UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue) const {
616   if (command_queue == nullptr) {
617     command_queue = default_command_queue_;
618   }
619   return command_queue->enqueueUnmapMemObject(buffer, host_ptr);
620 }
621 
UnmapBuffer(void * host_ptr,cl::CommandQueue * command_queue) const622 int OpenCLRuntime::UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue) const {
623   if (GetSVMCapabilities() & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) {
624     return RET_OK;
625   }
626   if (command_queue == nullptr) {
627     command_queue = default_command_queue_;
628   }
629   return clEnqueueSVMUnmap(command_queue->get(), host_ptr, 0, nullptr, nullptr);
630 }
631 
SyncCommandQueue(cl::CommandQueue * command_queue)632 bool OpenCLRuntime::SyncCommandQueue(cl::CommandQueue *command_queue) {
633   if (command_queue == nullptr) {
634     command_queue = default_command_queue_;
635   }
636   cl_int ret = command_queue->finish();
637   if (ret != CL_SUCCESS) {
638     MS_LOG(ERROR) << "Command queue sync failed: " << CLErrorCode(ret);
639     return RET_ERROR;
640   }
641   return ret == CL_SUCCESS;
642 }
643 
GetKernelMaxWorkGroupSize(cl_kernel kernel,cl_device_id device_id)644 int OpenCLRuntime::GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id) {
645   size_t max_work_group_size;
646   cl_int ret = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
647                                         &max_work_group_size, nullptr);
648   if (ret != CL_SUCCESS) {
649     MS_LOG(ERROR) << "Failed to get info CL_KERNEL_WORK_GROUP_SIZE " << CLErrorCode(ret);
650   }
651   return static_cast<int>(max_work_group_size);
652 }
653 
GetKernelFromBinary(const std::string & kernel_name)654 cl::Kernel OpenCLRuntime::GetKernelFromBinary(const std::string &kernel_name) {
655   cl_int ret = CL_SUCCESS;
656   cl::Kernel kernel = cl::Kernel(binary_program_, kernel_name.c_str(), &ret);
657   if (ret != CL_SUCCESS) {
658     MS_LOG(ERROR) << "Create kernel with binary program failed: " << CLErrorCode(ret);
659   }
660   return kernel;
661 }
662 
663 // build program with IL
CreateProgramFromIL(const std::vector<char> & binary,const std::string & flag)664 cl::Program OpenCLRuntime::CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag) {
665 #if defined(CL_HPP_TARGET_OPENCL_VERSION) && CL_HPP_TARGET_OPENCL_VERSION >= 210
666   cl::Program program = cl::Program(*context_, binary);
667   bool status = BuildProgram(default_build_opts_, program);
668   if (!status) {
669     MS_LOG(ERROR) << "Build program with IL failed!";
670   }
671   return program;
672 #else
673   MS_LOG(ERROR) << "Create program with IL failed! The compute capabitity of device should be 2.1 and higher.";
674   return cl::Program();
675 #endif
676 }
677 
678 // build program with binary
CreateProgramFromBinary(const std::vector<unsigned char> & binary,const std::string & build_option)679 cl::Program OpenCLRuntime::CreateProgramFromBinary(const std::vector<unsigned char> &binary,
680                                                    const std::string &build_option) {
681   cl::Program program = cl::Program(*context_, {*device_}, {binary});
682   bool status = BuildProgram(build_option, program);
683   if (!status) {
684     MS_LOG(ERROR) << "Build program with binary failed!";
685   }
686   return program;
687 }
688 
GetProgramBinary(const cl::Program & program)689 std::vector<unsigned char> OpenCLRuntime::GetProgramBinary(const cl::Program &program) {
690   cl_int ret = CL_SUCCESS;
691   auto binarys = program.getInfo<CL_PROGRAM_BINARIES>(&ret);
692   if (ret != CL_SUCCESS) {
693     MS_LOG(ERROR) << "Get program binary failed: " << CLErrorCode(ret);
694   }
695   if (binarys.empty()) {
696     MS_LOG(ERROR) << "binarys is empty";
697     return {};
698   }
699   return binarys.front();
700 }
701 
LoadCache()702 void OpenCLRuntime::LoadCache() {
703   if (!enable_cache_) {
704     return;
705   }
706   size_t len;
707   std::unique_ptr<char[]> buf(lite::ReadFile(cache_path_.c_str(), &len));
708   if (buf == nullptr) {
709     MS_LOG(ERROR) << "Load opencl cache fail: buf == nullptr";
710     return;
711   }
712   auto gpu_cache = schema::GetGpuCache(buf.get());
713   if (gpu_cache == nullptr) {
714     MS_LOG(ERROR) << "Load opencl cache fail: gpu_cache == nullptr";
715     return;
716   }
717   auto *bins = gpu_cache->allBins();
718   if (bins == nullptr) {
719     MS_LOG(ERROR) << "Load opencl cache fail: bins == nullptr";
720     return;
721   }
722   for (auto i = 0; i < bins->size(); ++i) {
723     auto *bin = bins->template GetAs<schema::ProgramBinary>(i);
724     if (bin == nullptr) {
725       MS_LOG(ERROR) << "kernel_bin[" << i << "] null";
726       return;
727     }
728     auto *pdata = bin->data();
729     MS_ASSERT(pdata);
730     if (pdata->size() == 0) {
731       continue;
732     }
733     std::vector<unsigned char> binary(pdata->begin(), pdata->end());
734     auto program = CreateProgramFromBinary(binary, bin->build_option()->str());
735     program_map_.emplace(std::make_pair(bin->program_name()->str(), bin->build_option()->str()), program);
736     MS_LOG(INFO) << "LoadCache " << bin->program_name() << " success, size=" << binary.size();
737   }
738   MS_LOG(INFO) << "Init opencl cache success";
739 }
740 
StoreCache()741 int OpenCLRuntime::StoreCache() {
742   if (!enable_cache_) {
743     return RET_OK;
744   }
745   if (!flush_cache_) {
746     return RET_OK;
747   }
748   auto fbb = std::make_unique<flatbuffers::FlatBufferBuilder>();
749   if (fbb == nullptr) {
750     MS_LOG(ERROR) << "new opencl FlatBufferBuilder fail";
751     return RET_ERROR;
752   }
753   std::vector<flatbuffers::Offset<schema::ProgramBinary>> program_binarys;
754   for (const auto &kv : program_map_) {
755     auto program_name = kv.first.first;
756     auto build_option = kv.first.second;
757     cl::Program program = kv.second;
758     auto binary = this->GetProgramBinary(program);
759     std::vector<int32_t> shape;
760     auto tune = schema::CreateTuneParam(*fbb, fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape),
761                                         fbb->CreateVector<int32_t>(shape), fbb->CreateVector<int32_t>(shape));
762     auto program_binary = schema::CreateProgramBinary(
763       *fbb, fbb->CreateString(program_name), fbb->CreateString(build_option), tune, fbb->CreateVector<uint8_t>(binary));
764     program_binarys.emplace_back(program_binary);
765     MS_LOG(INFO) << "StoreCache " << program_name << " success, size=" << binary.size();
766   }
767 
768   auto data = fbb->CreateVector<flatbuffers::Offset<schema::ProgramBinary>>(program_binarys);
769   auto name = fbb->CreateString("OpenCLCache");
770   auto version = fbb->CreateString(cache_version_);
771   auto gpu_cache = schema::CreateGpuCache(*fbb, name, version, data);
772   fbb->Finish(gpu_cache);
773   uint8_t *buf = fbb->GetBufferPointer();
774   if (WriteToBin(cache_path_, reinterpret_cast<void *>(buf), fbb->GetSize()) != RET_OK) {
775     MS_LOG(ERROR) << "WriteToBin failed.";
776     return RET_ERROR;
777   }
778   MS_LOG(INFO) << "store opencl cache ok, size=" << fbb->GetSize();
779   return RET_OK;
780 }
781 
CreateSharedMemoryBuffer(size_t size,void * host_ptr)782 cl::Buffer *OpenCLRuntime::CreateSharedMemoryBuffer(size_t size, void *host_ptr) {
783   cl_int error = CL_SUCCESS;
784   cl_mem cl_buffer = clImportMemoryARM(context_->get(), CL_MEM_READ_WRITE, NULL, host_ptr, size, &error);
785   if (error != CL_SUCCESS) {
786     MS_LOG(ERROR) << "Create OpenCL shared memory failed for" << CLErrorCode(error);
787     return nullptr;
788   }
789   cl::Buffer *buffer = new (std::nothrow) cl::Buffer(cl_buffer, false);
790   if (buffer == nullptr) {
791     MS_LOG(ERROR) << "New OpenCL Buffer failed";
792     return nullptr;
793   }
794   return buffer;
795 }
796 }  // namespace mindspore::lite::opencl
797