• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
18 #define MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
19 #include <vector>
20 #include <map>
21 #include <memory>
22 #include <set>
23 #include <string>
24 #include <utility>
25 #include <type_traits>
26 #include "ir/dtype/type_id.h"
27 #include "src/common/log_adapter.h"
28 #include "src/runtime/gpu/opencl/opencl_wrapper.h"
29 #include "src/runtime/gpu/opencl/opencl_allocator.h"
30 #include "schema/gpu_cache_generated.h"
31 #define EXT_ARM_IMPORT_MEMORY_HOST "cl_arm_import_memory_host"
32 
33 namespace mindspore::lite::opencl {
// GPU family tags stored in GpuInfo::type. OTHER is the default for a freshly constructed GpuInfo.
enum GpuType {
  OTHER = 0,
  ADRENO = 1,
  MALI = 2,
  MALI_T = 3,
  MALI_G = 4,
};

// Tuning level held by OpenCLRuntime (see SetTuningMode/GetTuningMode); the runtime starts in DEFAULT.
enum TuningMode {
  DEFAULT = 0,
  FAST = 1,
  EXTREME = 2,
};

// Initialization status tracked in OpenCLRuntime::init_state_.
enum InitState {
  UnInit = 0,
  InitSuccess = 1,
  InitFailed = 2,
};
37 
38 struct GpuInfo {
39   GpuType type = OTHER;
40 };
41 class OpenCLRuntimeInnerWrapper;
42 class OpenCLRuntimeWrapper;
43 class OpenCLRuntime {
44  public:
45   friend OpenCLRuntimeInnerWrapper;
46   friend OpenCLRuntimeWrapper;
47   ~OpenCLRuntime();
48   OpenCLRuntime(const OpenCLRuntime &) = delete;
49   OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
50 
51   int Init();
52   int Uninit();
53 
54   cl::Context *Context();
55   cl::Device *Device();
GetAllocator()56   std::shared_ptr<OpenCLAllocator> GetAllocator() { return allocator_; }
GetDefaultCommandQueue()57   cl::CommandQueue *GetDefaultCommandQueue() { return profiling_ ? profiling_command_queue_ : default_command_queue_; }
58   uint64_t DeviceGlobalMemoryCacheSize() const;
59   uint64_t DeviceMaxWorkGroupSize() const;
60   uint32_t DeviceComputeUnits() const;
61   uint32_t DeviceMaxFreq() const;
62   uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel);
63   uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange);
GetGlobalMemSize()64   uint64_t GetGlobalMemSize() { return global_memery_size_; }
GetMaxAllocSize()65   uint64_t GetMaxAllocSize() { return max_alloc_size_; }
GetMaxImage2DWidth()66   uint64_t GetMaxImage2DWidth() { return max_image2d_width_; }
GetMaxImage2DHeight()67   uint64_t GetMaxImage2DHeight() { return max_image2d_height_; }
68   GpuInfo GetGpuInfo();
69   bool GetFp16Enable() const;
70   bool SetFp16Enable(bool enable);
GetSVMEnable()71   bool GetSVMEnable() const { return svm_enable_; }
SetSVMEnable(bool enable)72   void SetSVMEnable(bool enable) { svm_enable_ = enable; }
GetWorkItemSize()73   const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; }
GetImagePitchAlignment()74   uint32_t GetImagePitchAlignment() const { return image_pitch_align_; }
GetSVMCapabilities()75   cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; }
76 
77   template <typename T>
78   typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel,
79                                                                                 uint32_t index, const T value,
80                                                                                 bool force_buffer = false) {
81     if (value == nullptr) {
82       MS_LOG(ERROR) << "value is nullptr.";
83       return CL_INVALID_VALUE;
84     }
85     auto svm_capabilities = GetSVMCapabilities();
86     if (svm_capabilities) {
87       MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value;
88       return clSetKernelArgSVMPointer(kernel.get(), index, value);
89     }
90     lite::opencl::MemType mem_type;
91     void *buffer = allocator_->GetOpenclMemPtr(value, &mem_type, force_buffer);
92     if (buffer == nullptr) {
93       MS_LOG(ERROR) << "buffer is nullptr.";
94       return CL_INVALID_VALUE;
95     }
96     MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL "
97                   << (mem_type == lite::opencl::MemType::IMG ? "Image " : "Buffer ") << buffer
98                   << ", host_ptr: " << value;
99     if (mem_type == lite::opencl::MemType::IMG) {
100       return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Image2D *>(buffer));
101     } else {
102       return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Buffer *>(buffer));
103     }
104   }
105 
106   template <typename T>
SetKernelArg(const cl::Kernel & kernel,uint32_t index,const T value)107   typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel,
108                                                                                  uint32_t index, const T value) {
109     return const_cast<cl::Kernel &>(kernel).setArg(index, value);
110   }
111 
112   cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag);
113   cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag);
114   cl::Kernel GetKernelFromBinary(const std::string &kernel_name);
115   std::vector<unsigned char> GetProgramBinary(const cl::Program &program);
116   bool LoadSource(const std::string &program_name, const std::string &source);
117   int BuildKernel(const cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name,
118                   const std::vector<std::string> &build_options_ext = {}, const bool is_builtin = true);
119   int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local,
120                 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr);
121   int ReadOrWriteImage(void *buffer, void *data, bool is_read);
122   int ReadImage(void *buffer, void *dst_data);
123   int WriteImage(void *buffer, void *src_data);
124   bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
125                            bool sync = false) const;
126   bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr,
127                            bool sync = false) const;
128   void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
129                   bool sync = false) const;
130   void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> &region,
131                   cl::CommandQueue *command_queue = nullptr) const;
132   int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr,
133                 bool sync = false) const;
134   int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
135   int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
136   bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);
137 
138   /**
139    * Get kernel max worker group size.
140    * @param kernel
141    * @param device_id
142    * @return max_work_group_size
143    */
144   int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id);
SetTuningMode(TuningMode mode)145   void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; }
GetTuningMode()146   TuningMode GetTuningMode() const { return tuning_mode_; }
147 
isProfiling()148   bool isProfiling() const { return profiling_; }
SetProfiling(bool profiling)149   void SetProfiling(bool profiling) { profiling_ = profiling; }
isExtensionEnable(std::string ext)150   bool isExtensionEnable(std::string ext) { return supported_extensions_.find(ext) != std::string::npos; }
151   cl::Buffer *CreateSharedMemoryBuffer(size_t size, void *host_ptr);
GetCacheLineSize()152   size_t GetCacheLineSize() const { return cache_line_size_; }
153 
154  private:
155   static OpenCLRuntime *GetInstance();
156   static void DeleteInstance();
157   OpenCLRuntime() = default;
158   GpuInfo ParseGpuInfo(std::string device_name, std::string device_version);
159 
160   bool LoadProgram(const std::string &program_name, cl::Program *program);
161   bool BuildProgram(const std::string &build_options, const cl::Program &program);
162   int InitGPUDevice(std::vector<cl::Platform> *platforms);
163   int InitQueue(std::vector<cl::Platform> *platforms);
164 
165  private:
166   static InitState init_state_;
167   static size_t instance_count_;
168   static OpenCLRuntime *ocl_runtime_instance_;
169   cl::CommandQueue *default_command_queue_{nullptr};
170   cl::CommandQueue *profiling_command_queue_{nullptr};
171   cl::Context *context_{nullptr};
172   cl::Device *device_{nullptr};
173   std::shared_ptr<OpenCLAllocator> allocator_{nullptr};
174   std::map<std::pair<std::string, std::string>, cl::Program> program_map_;
175   cl::Program binary_program_;
176   uint64_t global_memery_cachesize_{0};
177   uint64_t global_memery_size_{0};
178   uint64_t max_alloc_size_{0};
179   uint64_t max_image2d_width_{0};
180   uint64_t max_image2d_height_{0};
181   uint64_t max_work_group_size_{1};
182   uint32_t compute_units_{0};
183   uint32_t max_freq_{0};
184   std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"};
185   GpuInfo gpu_info_;
186   bool support_fp16_{false};
187   bool fp16_enable_{false};
188   bool svm_enable_{false};
189   cl_device_svm_capabilities svm_capabilities_{0};
190   cl_uint image_pitch_align_{0};
191   std::vector<size_t> max_work_item_sizes_;
192   void *handle_{nullptr};
193   TuningMode tuning_mode_{TuningMode::DEFAULT};
194 #if MS_OPENCL_PROFILE
195   bool profiling_{true};
196 #else
197   bool profiling_{false};
198   std::string supported_extensions_{""};
199   size_t cache_line_size_{1};
200 #endif
201   // for cache
202  private:
203   void LoadCache();
204   int StoreCache();
205 #ifdef MS_OPENCL_BINARY_CACHE
206   bool enable_cache_{true};
207 #else
208   bool enable_cache_{false};
209 #endif
210   bool flush_cache_{false};
211   std::string cache_path_{"/data/local/tmp/.opencl_cache"};
212   const std::string cache_version_{"V0.1"};
213 };
214 
215 class OpenCLRuntimeInnerWrapper {
216  public:
OpenCLRuntimeInnerWrapper()217   OpenCLRuntimeInnerWrapper() { ocl_runtime_ = OpenCLRuntime::GetInstance(); }
~OpenCLRuntimeInnerWrapper()218   ~OpenCLRuntimeInnerWrapper() { OpenCLRuntime::DeleteInstance(); }
219   OpenCLRuntimeInnerWrapper(const OpenCLRuntimeInnerWrapper &) = delete;
220   OpenCLRuntimeInnerWrapper &operator=(const OpenCLRuntimeInnerWrapper &) = delete;
GetInstance()221   OpenCLRuntime *GetInstance() { return ocl_runtime_; }
222 
223  private:
224   OpenCLRuntime *ocl_runtime_{nullptr};
225 };
226 }  // namespace mindspore::lite::opencl
227 #endif  // MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
228