1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_ 18 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_ 19 #include <vector> 20 #include <map> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <utility> 25 #include <type_traits> 26 #include "dtype/type_id.h" 27 #include "src/common/log_adapter.h" 28 #include "src/litert/kernel/gpu/opencl/opencl_wrapper.h" 29 #include "src/litert/kernel/gpu/opencl/opencl_allocator.h" 30 #include "schema/gpu_cache_generated.h" 31 #define EXT_ARM_IMPORT_MEMORY_HOST "cl_arm_import_memory_host" 32 33 namespace mindspore::lite::opencl { 34 enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4, MALI_G78 = 5 }; 35 enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 }; 36 enum InitState { UnInit = 0, InitSuccess = 1, InitFailed = 2 }; 37 38 struct GpuInfo { 39 GpuType type = OTHER; 40 }; 41 class OpenCLRuntimeInnerWrapper; 42 class OpenCLRuntimeWrapper; 43 class OpenCLRuntime { 44 public: 45 friend OpenCLRuntimeInnerWrapper; 46 friend OpenCLRuntimeWrapper; 47 ~OpenCLRuntime(); 48 OpenCLRuntime(const OpenCLRuntime &) = delete; 49 OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; 50 51 int Init(); 52 int Uninit(); 53 54 cl::Context *Context(); 55 cl::Device *Device(); GetAllocator()56 std::shared_ptr<OpenCLAllocator> GetAllocator() { return allocator_; } GetDefaultCommandQueue()57 cl::CommandQueue *GetDefaultCommandQueue() { return profiling_ ? profiling_command_queue_ : default_command_queue_; } 58 uint64_t DeviceGlobalMemoryCacheSize() const; 59 uint64_t DeviceMaxWorkGroupSize() const; 60 uint32_t DeviceComputeUnits() const; 61 uint32_t DeviceMaxFreq() const; 62 uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel); 63 uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange); GetGlobalMemSize()64 uint64_t GetGlobalMemSize() { return global_memery_size_; } GetMaxAllocSize()65 uint64_t GetMaxAllocSize() { return max_alloc_size_; } GetMaxImage2DWidth()66 uint64_t GetMaxImage2DWidth() { return max_image2d_width_; } GetMaxImage2DHeight()67 uint64_t GetMaxImage2DHeight() { return max_image2d_height_; } 68 GpuInfo GetGpuInfo(); 69 bool GetFp16Enable() const; 70 bool SetFp16Enable(bool enable); 71 bool GetGLTextureEnable() const; 72 bool SetGLTextureEnable(bool enable); 73 SetGLContext(void * gl_context)74 void SetGLContext(void *gl_context) { gl_context_ = gl_context; } GetGLContext()75 void *GetGLContext() const { return gl_context_; } CheckGLContext()76 bool CheckGLContext() const { return (GetGLContext() != nullptr); } 77 SetGLDisplay(void * gl_display)78 void SetGLDisplay(void *gl_display) { gl_display_ = gl_display; } GetGLDisplay()79 void *GetGLDisplay() const { return gl_display_; } CheckGLDisplay()80 bool CheckGLDisplay() const { return (GetGLDisplay() != nullptr); } 81 GetSVMEnable()82 bool GetSVMEnable() const { return svm_enable_; } SetSVMEnable(bool enable)83 void SetSVMEnable(bool enable) { svm_enable_ = enable; } GetWorkItemSize()84 const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; } GetImagePitchAlignment()85 uint32_t GetImagePitchAlignment() const { return image_pitch_align_; } GetSVMCapabilities()86 cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; } 87 88 template <typename T> 89 typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel, 90 uint32_t index, const T value, 91 bool force_buffer = false) { 92 if (value == nullptr) { 93 MS_LOG(ERROR) << "value is nullptr."; 94 return CL_INVALID_VALUE; 95 } 96 auto svm_capabilities = GetSVMCapabilities(); 97 if (svm_capabilities) { 98 MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; 99 return clSetKernelArgSVMPointer(kernel.get(), index, value); 100 } 101 lite::opencl::MemType mem_type; 102 void *buffer = allocator_->GetOpenclMemPtr(value, &mem_type, force_buffer); 103 if (buffer == nullptr) { 104 MS_LOG(ERROR) << "buffer is nullptr."; 105 return CL_INVALID_VALUE; 106 } 107 MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL " 108 << (mem_type == lite::opencl::MemType::IMG ? "Image " : "Buffer ") << buffer 109 << ", host_ptr: " << value; 110 if (mem_type == lite::opencl::MemType::IMG) { 111 return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Image2D *>(buffer)); 112 } else { 113 return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Buffer *>(buffer)); 114 } 115 } 116 117 template <typename T> SetKernelArg(const cl::Kernel & kernel,uint32_t index,const T value)118 typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel, 119 uint32_t index, const T value) { 120 return const_cast<cl::Kernel &>(kernel).setArg(index, value); 121 } 122 123 cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag); 124 cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag); 125 cl::Kernel GetKernelFromBinary(const std::string &kernel_name); 126 std::vector<unsigned char> GetProgramBinary(const cl::Program &program); 127 bool LoadSource(const std::string &program_name, const std::string &source); 128 int BuildKernel(const cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, 129 const std::vector<std::string> &build_options_ext = {}, const bool is_builtin = true); 130 int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, 131 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); 132 int ReadOrWriteImage(void *buffer, void *data, bool is_read); 133 int ReadImage(void *buffer, void *dst_data); 134 int WriteImage(void *buffer, void *src_data); 135 bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, 136 bool sync = false) const; 137 bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, 138 bool sync = false) const; 139 void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, 140 bool sync = false) const; 141 void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> ®ion, 142 cl::CommandQueue *command_queue = nullptr) const; 143 int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, 144 bool sync = false) const; 145 int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; 146 int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; 147 bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr); 148 149 /** 150 * Get kernel max worker group size. 151 * @param kernel 152 * @param device_id 153 * @return max_work_group_size 154 */ 155 int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); SetTuningMode(TuningMode mode)156 void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; } GetTuningMode()157 TuningMode GetTuningMode() const { return tuning_mode_; } 158 isProfiling()159 bool isProfiling() const { return profiling_; } SetProfiling(bool profiling)160 void SetProfiling(bool profiling) { profiling_ = profiling; } isExtensionEnable(std::string ext)161 bool isExtensionEnable(std::string ext) { return supported_extensions_.find(ext) != std::string::npos; } 162 cl::Buffer *CreateSharedMemoryBuffer(size_t size, void *host_ptr); GetCacheLineSize()163 size_t GetCacheLineSize() const { return cache_line_size_; } 164 165 private: 166 static OpenCLRuntime *GetInstance(); 167 static void DeleteInstance(); 168 OpenCLRuntime() = default; 169 GpuInfo ParseGpuInfo(std::string device_name, std::string device_version); 170 171 bool LoadProgram(const std::string &program_name, cl::Program *program); 172 bool BuildProgram(const std::string &build_options, const cl::Program &program); 173 int InitGPUDevice(std::vector<cl::Platform> *platforms); 174 int InitQueue(std::vector<cl::Platform> *platforms); 175 176 private: 177 static InitState init_state_; 178 static size_t instance_count_; 179 static OpenCLRuntime *ocl_runtime_instance_; 180 cl::CommandQueue *default_command_queue_{nullptr}; 181 cl::CommandQueue *profiling_command_queue_{nullptr}; 182 cl::Context *context_{nullptr}; 183 cl::Device *device_{nullptr}; 184 std::shared_ptr<OpenCLAllocator> allocator_{nullptr}; 185 std::map<std::pair<std::string, std::string>, cl::Program> program_map_; 186 cl::Program binary_program_; 187 uint64_t global_memery_cachesize_{0}; 188 uint64_t global_memery_size_{0}; 189 uint64_t max_alloc_size_{0}; 190 uint64_t max_image2d_width_{0}; 191 uint64_t max_image2d_height_{0}; 192 uint64_t max_work_group_size_{1}; 193 uint32_t compute_units_{0}; 194 uint32_t max_freq_{0}; 195 std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"}; 196 GpuInfo gpu_info_; 197 bool support_fp16_{false}; 198 bool fp16_enable_{false}; 199 bool svm_enable_{false}; 200 cl_device_svm_capabilities svm_capabilities_{0}; 201 cl_uint image_pitch_align_{0}; 202 std::vector<size_t> max_work_item_sizes_; 203 void *handle_{nullptr}; 204 bool enable_gl_texture_{false}; 205 void *gl_context_{nullptr}; 206 void *gl_display_{nullptr}; 207 TuningMode tuning_mode_{TuningMode::DEFAULT}; 208 #if MS_OPENCL_PROFILE 209 bool profiling_{true}; 210 #else 211 bool profiling_{false}; 212 std::string supported_extensions_{""}; 213 size_t cache_line_size_{1}; 214 #endif 215 // for cache 216 private: 217 void LoadCache(); 218 int StoreCache(); 219 #ifdef MS_OPENCL_BINARY_CACHE 220 bool enable_cache_{true}; 221 #else 222 bool enable_cache_{false}; 223 #endif 224 bool flush_cache_{false}; 225 std::string cache_path_{"/data/local/tmp/.opencl_cache"}; 226 const std::string cache_version_{"V0.1"}; 227 }; 228 229 class OpenCLRuntimeInnerWrapper { 230 public: OpenCLRuntimeInnerWrapper()231 OpenCLRuntimeInnerWrapper() { ocl_runtime_ = OpenCLRuntime::GetInstance(); } ~OpenCLRuntimeInnerWrapper()232 ~OpenCLRuntimeInnerWrapper() { OpenCLRuntime::DeleteInstance(); } 233 OpenCLRuntimeInnerWrapper(const OpenCLRuntimeInnerWrapper &) = delete; 234 OpenCLRuntimeInnerWrapper &operator=(const OpenCLRuntimeInnerWrapper &) = delete; GetInstance()235 OpenCLRuntime *GetInstance() { return ocl_runtime_; } 236 237 private: 238 OpenCLRuntime *ocl_runtime_{nullptr}; 239 }; 240 } // namespace mindspore::lite::opencl 241 #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_GPU_OPENCL_OPENCL_RUNTIME_H_ 242