1 /** 2 * Copyright 2019 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_ 18 #define MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_ 19 #include <vector> 20 #include <map> 21 #include <memory> 22 #include <set> 23 #include <string> 24 #include <utility> 25 #include <type_traits> 26 #include "ir/dtype/type_id.h" 27 #include "src/common/log_adapter.h" 28 #include "src/runtime/gpu/opencl/opencl_wrapper.h" 29 #include "src/runtime/gpu/opencl/opencl_allocator.h" 30 #include "schema/gpu_cache_generated.h" 31 #define EXT_ARM_IMPORT_MEMORY_HOST "cl_arm_import_memory_host" 32 33 namespace mindspore::lite::opencl { 34 enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; 35 enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 }; 36 enum InitState { UnInit = 0, InitSuccess = 1, InitFailed = 2 }; 37 38 struct GpuInfo { 39 GpuType type = OTHER; 40 }; 41 class OpenCLRuntimeInnerWrapper; 42 class OpenCLRuntimeWrapper; 43 class OpenCLRuntime { 44 public: 45 friend OpenCLRuntimeInnerWrapper; 46 friend OpenCLRuntimeWrapper; 47 ~OpenCLRuntime(); 48 OpenCLRuntime(const OpenCLRuntime &) = delete; 49 OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; 50 51 int Init(); 52 int Uninit(); 53 54 cl::Context *Context(); 55 cl::Device *Device(); GetAllocator()56 std::shared_ptr<OpenCLAllocator> GetAllocator() { return allocator_; } GetDefaultCommandQueue()57 cl::CommandQueue *GetDefaultCommandQueue() { return profiling_ ? profiling_command_queue_ : default_command_queue_; } 58 uint64_t DeviceGlobalMemoryCacheSize() const; 59 uint64_t DeviceMaxWorkGroupSize() const; 60 uint32_t DeviceComputeUnits() const; 61 uint32_t DeviceMaxFreq() const; 62 uint64_t GetMaxWorkGroupSize(const cl::Kernel &kernel); 63 uint32_t GetSubGroupSize(const cl::Kernel &kernel, const cl::NDRange &range = cl::NullRange); GetGlobalMemSize()64 uint64_t GetGlobalMemSize() { return global_memery_size_; } GetMaxAllocSize()65 uint64_t GetMaxAllocSize() { return max_alloc_size_; } GetMaxImage2DWidth()66 uint64_t GetMaxImage2DWidth() { return max_image2d_width_; } GetMaxImage2DHeight()67 uint64_t GetMaxImage2DHeight() { return max_image2d_height_; } 68 GpuInfo GetGpuInfo(); 69 bool GetFp16Enable() const; 70 bool SetFp16Enable(bool enable); GetSVMEnable()71 bool GetSVMEnable() const { return svm_enable_; } SetSVMEnable(bool enable)72 void SetSVMEnable(bool enable) { svm_enable_ = enable; } GetWorkItemSize()73 const std::vector<size_t> &GetWorkItemSize() const { return max_work_item_sizes_; } GetImagePitchAlignment()74 uint32_t GetImagePitchAlignment() const { return image_pitch_align_; } GetSVMCapabilities()75 cl_device_svm_capabilities GetSVMCapabilities() const { return svm_enable_ ? svm_capabilities_ : 0; } 76 77 template <typename T> 78 typename std::enable_if<std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel, 79 uint32_t index, const T value, 80 bool force_buffer = false) { 81 if (value == nullptr) { 82 MS_LOG(ERROR) << "value is nullptr."; 83 return CL_INVALID_VALUE; 84 } 85 auto svm_capabilities = GetSVMCapabilities(); 86 if (svm_capabilities) { 87 MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; 88 return clSetKernelArgSVMPointer(kernel.get(), index, value); 89 } 90 lite::opencl::MemType mem_type; 91 void *buffer = allocator_->GetOpenclMemPtr(value, &mem_type, force_buffer); 92 if (buffer == nullptr) { 93 MS_LOG(ERROR) << "buffer is nullptr."; 94 return CL_INVALID_VALUE; 95 } 96 MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL " 97 << (mem_type == lite::opencl::MemType::IMG ? "Image " : "Buffer ") << buffer 98 << ", host_ptr: " << value; 99 if (mem_type == lite::opencl::MemType::IMG) { 100 return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Image2D *>(buffer)); 101 } else { 102 return const_cast<cl::Kernel &>(kernel).setArg(index, *reinterpret_cast<cl::Buffer *>(buffer)); 103 } 104 } 105 106 template <typename T> SetKernelArg(const cl::Kernel & kernel,uint32_t index,const T value)107 typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type SetKernelArg(const cl::Kernel &kernel, 108 uint32_t index, const T value) { 109 return const_cast<cl::Kernel &>(kernel).setArg(index, value); 110 } 111 112 cl::Program CreateProgramFromIL(const std::vector<char> &binary, const std::string &flag); 113 cl::Program CreateProgramFromBinary(const std::vector<unsigned char> &binary, const std::string &flag); 114 cl::Kernel GetKernelFromBinary(const std::string &kernel_name); 115 std::vector<unsigned char> GetProgramBinary(const cl::Program &program); 116 bool LoadSource(const std::string &program_name, const std::string &source); 117 int BuildKernel(const cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, 118 const std::vector<std::string> &build_options_ext = {}, const bool is_builtin = true); 119 int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, 120 cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); 121 int ReadOrWriteImage(void *buffer, void *data, bool is_read); 122 int ReadImage(void *buffer, void *dst_data); 123 int WriteImage(void *buffer, void *src_data); 124 bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, 125 bool sync = false) const; 126 bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, 127 bool sync = false) const; 128 void *MapBuffer(const cl::Buffer &buffer, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, 129 bool sync = false) const; 130 void *MapBuffer(const cl::Image2D &buffer, bool sync, int flags, const std::vector<size_t> ®ion, 131 cl::CommandQueue *command_queue = nullptr) const; 132 int MapBuffer(void *host_ptr, int map_flags, size_t size, cl::CommandQueue *command_queue = nullptr, 133 bool sync = false) const; 134 int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; 135 int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const; 136 bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr); 137 138 /** 139 * Get kernel max worker group size. 140 * @param kernel 141 * @param device_id 142 * @return max_work_group_size 143 */ 144 int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); SetTuningMode(TuningMode mode)145 void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; } GetTuningMode()146 TuningMode GetTuningMode() const { return tuning_mode_; } 147 isProfiling()148 bool isProfiling() const { return profiling_; } SetProfiling(bool profiling)149 void SetProfiling(bool profiling) { profiling_ = profiling; } isExtensionEnable(std::string ext)150 bool isExtensionEnable(std::string ext) { return supported_extensions_.find(ext) != std::string::npos; } 151 cl::Buffer *CreateSharedMemoryBuffer(size_t size, void *host_ptr); GetCacheLineSize()152 size_t GetCacheLineSize() const { return cache_line_size_; } 153 154 private: 155 static OpenCLRuntime *GetInstance(); 156 static void DeleteInstance(); 157 OpenCLRuntime() = default; 158 GpuInfo ParseGpuInfo(std::string device_name, std::string device_version); 159 160 bool LoadProgram(const std::string &program_name, cl::Program *program); 161 bool BuildProgram(const std::string &build_options, const cl::Program &program); 162 int InitGPUDevice(std::vector<cl::Platform> *platforms); 163 int InitQueue(std::vector<cl::Platform> *platforms); 164 165 private: 166 static InitState init_state_; 167 static size_t instance_count_; 168 static OpenCLRuntime *ocl_runtime_instance_; 169 cl::CommandQueue *default_command_queue_{nullptr}; 170 cl::CommandQueue *profiling_command_queue_{nullptr}; 171 cl::Context *context_{nullptr}; 172 cl::Device *device_{nullptr}; 173 std::shared_ptr<OpenCLAllocator> allocator_{nullptr}; 174 std::map<std::pair<std::string, std::string>, cl::Program> program_map_; 175 cl::Program binary_program_; 176 uint64_t global_memery_cachesize_{0}; 177 uint64_t global_memery_size_{0}; 178 uint64_t max_alloc_size_{0}; 179 uint64_t max_image2d_width_{0}; 180 uint64_t max_image2d_height_{0}; 181 uint64_t max_work_group_size_{1}; 182 uint32_t compute_units_{0}; 183 uint32_t max_freq_{0}; 184 std::string default_build_option_{"-cl-mad-enable -cl-fast-relaxed-math -Werror"}; 185 GpuInfo gpu_info_; 186 bool support_fp16_{false}; 187 bool fp16_enable_{false}; 188 bool svm_enable_{false}; 189 cl_device_svm_capabilities svm_capabilities_{0}; 190 cl_uint image_pitch_align_{0}; 191 std::vector<size_t> max_work_item_sizes_; 192 void *handle_{nullptr}; 193 TuningMode tuning_mode_{TuningMode::DEFAULT}; 194 #if MS_OPENCL_PROFILE 195 bool profiling_{true}; 196 #else 197 bool profiling_{false}; 198 std::string supported_extensions_{""}; 199 size_t cache_line_size_{1}; 200 #endif 201 // for cache 202 private: 203 void LoadCache(); 204 int StoreCache(); 205 #ifdef MS_OPENCL_BINARY_CACHE 206 bool enable_cache_{true}; 207 #else 208 bool enable_cache_{false}; 209 #endif 210 bool flush_cache_{false}; 211 std::string cache_path_{"/data/local/tmp/.opencl_cache"}; 212 const std::string cache_version_{"V0.1"}; 213 }; 214 215 class OpenCLRuntimeInnerWrapper { 216 public: OpenCLRuntimeInnerWrapper()217 OpenCLRuntimeInnerWrapper() { ocl_runtime_ = OpenCLRuntime::GetInstance(); } ~OpenCLRuntimeInnerWrapper()218 ~OpenCLRuntimeInnerWrapper() { OpenCLRuntime::DeleteInstance(); } 219 OpenCLRuntimeInnerWrapper(const OpenCLRuntimeInnerWrapper &) = delete; 220 OpenCLRuntimeInnerWrapper &operator=(const OpenCLRuntimeInnerWrapper &) = delete; GetInstance()221 OpenCLRuntime *GetInstance() { return ocl_runtime_; } 222 223 private: 224 OpenCLRuntime *ocl_runtime_{nullptr}; 225 }; 226 } // namespace mindspore::lite::opencl 227 #endif // MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_ 228