• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_
18 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_
19 #include <vector>
20 #include <map>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <utility>
25 #include <type_traits>
26 #include "dtype/type_id.h"
27 #include "src/common/log_adapter.h"
28 #include "src/litert/kernel/gpu/opencl/opencl_wrapper.h"
29 #include "src/litert/kernel/gpu/opencl/opencl_allocator.h"
30 #include "schema/gpu_cache_generated.h"
// Extension-name string for Arm's host-memory import extension; a macro so it can be
// used wherever a string literal is expected.
#define EXT_ARM_IMPORT_MEMORY_HOST "cl_arm_import_memory_host"
32 
33 namespace mindspore::lite::opencl {
// GPU family tag; this is the type of GpuInfo::type.
enum GpuType {
  OTHER = 0,
  ADRENO = 1,
  MALI = 2,
  MALI_T = 3,
  MALI_G = 4,
  MALI_G78 = 5,
};
// Tuning level held by OpenCLRuntime (see SetTuningMode / GetTuningMode).
enum TuningMode {
  DEFAULT = 0,
  FAST = 1,
  EXTREME = 2,
};
// Initialization status; this is the type of the static OpenCLRuntime::init_state_.
enum InitState {
  UnInit = 0,
  InitSuccess = 1,
  InitFailed = 2,
};
37 
38 struct GpuInfo {
39   GpuType type = OTHER;
40 };
// Forward declarations: OpenCLRuntime names both wrapper classes in its friend declarations.
class OpenCLRuntimeWrapper;
class OpenCLRuntimeInnerWrapper;
43 class OpenCLRuntime {
44  public:
45   friend OpenCLRuntimeInnerWrapper;
46   friend OpenCLRuntimeWrapper;
47   ~OpenCLRuntime();
48   OpenCLRuntime(const OpenCLRuntime &) = delete;
49   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
50 
51   int Init();
52   int Uninit();
53 
54   cl::Context *Context();
55   cl::Device *Device();
GetAllocator()56   std::shared_ptr<OpenCLAllocator> GetAllocator() { return allocator_; }
GetDefaultCommandQueue()57   cl::CommandQueue *GetDefaultCommandQueue() { return profiling_ ? profiling_command_queue_ : default_command_queue_; }
58   uint64_t DeviceGlobalMemoryCacheSize() const;
59   uint64_t DeviceMaxWorkGroupSize() const;
60   uint32_t DeviceComputeUnits() const;
61   uint32_t DeviceMaxFreq() const;
62   uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel);
63   uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange);
GetGlobalMemSize()64   uint64_t GetGlobalMemSize() { return global_memery_size_; }
GetMaxAllocSize()65   uint64_t GetMaxAllocSize() { return max_alloc_size_; }
GetMaxImage2DWidth()66   uint64_t GetMaxImage2DWidth() { return max_image2d_width_; }
GetMaxImage2DHeight()67   uint64_t GetMaxImage2DHeight() { return max_image2d_height_; }
68   GpuInfo GetGpuInfo();
69   bool GetFp16Enable() const;
70   bool SetFp16Enable(bool enable);
71   bool GetGLTextureEnable() const;
72   bool SetGLTextureEnable(bool enable);
73 
SetGLContext(void * gl_context)74   void SetGLContext(void *gl_context) { gl_context_ = gl_context; }
GetGLContext()75   void *GetGLContext() const { return gl_context_; }
CheckGLContext()76   bool CheckGLContext() const { return (GetGLContext() != nullptr); }
77 
SetGLDisplay(void * gl_display)78   void SetGLDisplay(void *gl_display) { gl_display_ = gl_display; }
GetGLDisplay()79   void *GetGLDisplay() const { return gl_display_; }
CheckGLDisplay()80   bool CheckGLDisplay() const { return (GetGLDisplay() != nullptr); }
81 
GetSVMEnable()82   bool GetSVMEnable() const { return svm_enable_; }
SetSVMEnable(bool enable)83   void SetSVMEnable(bool enable) { svm_enable_ = enable; }
GetWorkItemSize()84   const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }
GetImagePitchAlignment()85   uint32_t GetImagePitchAlignment() const { return image_pitch_align_; }
GetSVMCapabilities()86   cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; }
87 
88   template <typename T>
89   typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel,
90                                                                                 uint32_t index, const T value,
91                                                                                 bool force_buffer = false) {
92     if (value == nullptr) {
93       MS_LOG(ERROR) << "value is nullptr.";
94       return CL_INVALID_VALUE;
95     }
96     auto svm_capabilities = GetSVMCapabilities();
97     if (svm_capabilities) {
98       MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value;
99       return clSetKernelArgSVMPointer(kernel.get(), index, value);
100     }
101     lite::opencl::MemType mem_type;
102     void *buffer = allocator_->GetOpenclMemPtr(value, &mem_type, force_buffer);
103     if (buffer == nullptr) {
104       MS_LOG(ERROR) << "buffer is nullptr.";
105       return CL_INVALID_VALUE;
106     }
107     MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL "
108                   << (mem_type == lite::opencl::MemType::IMG ? "Image " : "Buffer ") << buffer
109                   << ", host_ptr: " << value;
110     if (mem_type == lite::opencl::MemType::IMG) {
111       return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Image2D *>(buffer));
112     } else {
113       return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Buffer *>(buffer));
114     }
115   }
116 
117   template <typename T>
SetKernelArg(const cl::Kernel & kernel,uint32_t index,const T value)118   typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel,
119                                                                                  uint32_t index, const T value) {
120     return const_cast<cl::Kernel &>(kernel).setArg(index, value);
121   }
122 
123   cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag);
124   cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag);
125   cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
126   std::vector<unsigned char> GetProgramBinary(const cl::Program &program);
127   bool LoadSource(const std::string &program_name, const std::string &source);
128   int BuildKernel(const cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
129                   const std::vector<std::string> &build_options_ext = {}, const bool is_builtin = true);
130   int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
131                 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
132   int ReadOrWriteImage(void *buffer, void *data, bool is_read);
133   int ReadImage(void *buffer, void *dst_data);
134   int WriteImage(void *buffer, void *src_data);
135   bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
136                            bool sync = false) const;
137   bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
138                            bool sync = false) const;
139   void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
140                   bool sync = false) const;
141   void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
142                   cl::CommandQueue *command_queue = nullptr) const;
143   int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
144                 bool sync = false) const;
145   int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
146   int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
147   bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);
148 
149   /**
150    * Get kernel max worker group size.
151    * @param kernel
152    * @param device_id
153    * @return max_work_group_size
154    */
155   int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id);
SetTuningMode(TuningMode mode)156   void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; }
GetTuningMode()157   TuningMode GetTuningMode() const { return tuning_mode_; }
158 
isProfiling()159   bool isProfiling() const { return profiling_; }
SetProfiling(bool profiling)160   void SetProfiling(bool profiling) { profiling_ = profiling; }
isExtensionEnable(std::string ext)161   bool isExtensionEnable(std::string ext) { return supported_extensions_.find(ext) != std::string::npos; }
162   cl::Buffer *CreateSharedMemoryBuffer(size_t size, void *host_ptr);
GetCacheLineSize()163   size_t GetCacheLineSize() const { return cache_line_size_; }
164 
165  private:
166   static OpenCLRuntime *GetInstance();
167   static void DeleteInstance();
168   OpenCLRuntime() = default;
169   GpuInfo ParseGpuInfo(std::string device_name, std::string device_version);
170 
171   bool LoadProgram(const std::string &program_name, cl::Program *program);
172   bool BuildProgram(const std::string &build_options, const cl::Program &program);
173   int InitGPUDevice(std::vector<cl::Platform> *platforms);
174   int InitQueue(std::vector<cl::Platform> *platforms);
175 
176  private:
177   static InitState init_state_;
178   static size_t instance_count_;
179   static OpenCLRuntime *ocl_runtime_instance_;
180   cl::CommandQueue *default_command_queue_{nullptr};
181   cl::CommandQueue *profiling_command_queue_{nullptr};
182   cl::Context *context_{nullptr};
183   cl::Device *device_{nullptr};
184   std::shared_ptr<OpenCLAllocator> allocator_{nullptr};
185   std::map<std::pair<std::string, std::string>, cl::Program> program_map_;
186   cl::Program binary_program_;
187   uint64_t global_memery_cachesize_{0};
188   uint64_t global_memery_size_{0};
189   uint64_t max_alloc_size_{0};
190   uint64_t max_image2d_width_{0};
191   uint64_t max_image2d_height_{0};
192   uint64_t max_work_group_size_{1};
193   uint32_t compute_units_{0};
194   uint32_t max_freq_{0};
195   std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
196   GpuInfo gpu_info_;
197   bool support_fp16_{false};
198   bool fp16_enable_{false};
199   bool svm_enable_{false};
200   cl_device_svm_capabilities svm_capabilities_{0};
201   cl_uint image_pitch_align_{0};
202   std::vector<size_t> max_work_item_sizes_;
203   void *handle_{nullptr};
204   bool enable_gl_texture_{false};
205   void *gl_context_{nullptr};
206   void *gl_display_{nullptr};
207   TuningMode tuning_mode_{TuningMode::DEFAULT};
208 #if MS_OPENCL_PROFILE
209   bool profiling_{true};
210 #else
211   bool profiling_{false};
212   std::string supported_extensions_{""};
213   size_t cache_line_size_{1};
214 #endif
215   // for cache
216  private:
217   void LoadCache();
218   int StoreCache();
219 #ifdef MS_OPENCL_BINARY_CACHE
220   bool enable_cache_{true};
221 #else
222   bool enable_cache_{false};
223 #endif
224   bool flush_cache_{false};
225   std::string cache_path_{"/data/local/tmp/.opencl_cache"};
226   const std::string cache_version_{"V0.1"};
227 };
228 
229 class OpenCLRuntimeInnerWrapper {
230  public:
OpenCLRuntimeInnerWrapper()231   OpenCLRuntimeInnerWrapper() { ocl_runtime_ = OpenCLRuntime::GetInstance(); }
~OpenCLRuntimeInnerWrapper()232   ~OpenCLRuntimeInnerWrapper() { OpenCLRuntime::DeleteInstance(); }
233   OpenCLRuntimeInnerWrapper(const OpenCLRuntimeInnerWrapper &) = delete;
234   OpenCLRuntimeInnerWrapper &operator=(const OpenCLRuntimeInnerWrapper &) = delete;
GetInstance()235   OpenCLRuntime *GetInstance() { return ocl_runtime_; }
236 
237  private:
238   OpenCLRuntime *ocl_runtime_{nullptr};
239 };
240 }  // namespace mindspore::lite::opencl
241 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_
242