• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_OPENCL_KERNEL_H_
18 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_OPENCL_KERNEL_H_
// 10 seconds expressed in milliseconds. The expansion is parenthesized so the
// macro behaves as a single value inside larger expressions (for example
// `x / MAX_PROFILING_TIME_MILLI_SECOND` must not expand to `x / 10 * 1000`).
#define MAX_PROFILING_TIME_MILLI_SECOND (10 * 1000)  // 10 seconds
20 #include <vector>
21 #include <set>
22 #include <map>
23 #include <memory>
24 #include <string>
25 #include <cfloat>
26 #include "src/litert/lite_kernel.h"
27 #include "src/executor/kernel_exec.h"
28 #include "include/errorcode.h"
29 #include "src/litert/kernel/gpu/opencl/opencl_runtime.h"
30 #include "mindspore/lite/src/litert/tensor_category.h"
31 #include "src/litert/kernel/opencl/utils.h"
32 #include "nnacl/resize_parameter.h"
33 
34 using mindspore::lite::RET_ERROR;
35 using mindspore::lite::RET_OK;
36 
37 namespace mindspore::kernel {
// Named values for input tensor counts (1 through 6, and 16).
constexpr int INPUT_TENSOR_SIZE_1{1};
constexpr int INPUT_TENSOR_SIZE_2{2};
constexpr int INPUT_TENSOR_SIZE_3{3};
constexpr int INPUT_TENSOR_SIZE_4{4};
constexpr int INPUT_TENSOR_SIZE_5{5};
constexpr int INPUT_TENSOR_SIZE_6{6};
constexpr int INPUT_TENSOR_SIZE_16{16};

// Named values for output tensor counts (1 through 4).
constexpr int OUTPUT_TENSOR_SIZE_1{1};
constexpr int OUTPUT_TENSOR_SIZE_2{2};
constexpr int OUTPUT_TENSOR_SIZE_3{3};
constexpr int OUTPUT_TENSOR_SIZE_4{4};
49 
50 struct OpenCLToFormatParameter {
51   OpParameter op_parameter{};
52   lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
53 };
54 
55 struct OpenGLTexture2DToOpenCLParameter {
56   OpParameter op_parameter{};
57   lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
58 };
59 
60 template <typename SrcT, typename DstT>
Broadcast2GpuShape(const SrcT * src,int src_num,DstT * dst,int dsc_num)61 int Broadcast2GpuShape(const SrcT *src, int src_num, DstT *dst, int dsc_num) {
62   if (src == nullptr || src_num <= 0 || dst == nullptr || dsc_num < src_num) {
63     MS_LOG(WARNING) << "Broadcast2GpuShape invalid input";
64     return RET_ERROR;
65   }
66 
67   if (src_num == DIMENSION_1D) {  // 1 1 1 C
68     dst[kNHWC_C] = src[DIMENSION_0D];
69   } else if (src_num == DIMENSION_2D) {  // N 1 1 C
70     dst[kNHWC_N] = src[DIMENSION_0D];
71     dst[kNHWC_C] = src[DIMENSION_1D];
72   } else if (src_num == DIMENSION_3D) {  // N 1 W C
73     dst[kNHWC_N] = src[DIMENSION_0D];
74     dst[kNHWC_W] = src[DIMENSION_1D];
75     dst[kNHWC_C] = src[DIMENSION_2D];
76   } else if (src_num == DIMENSION_4D) {  // N H W C
77     dst[kNHWC_N] = src[DIMENSION_0D];
78     dst[kNHWC_H] = src[DIMENSION_1D];
79     dst[kNHWC_W] = src[DIMENSION_2D];
80     dst[kNHWC_C] = src[DIMENSION_3D];
81   } else if (src_num == DIMENSION_5D) {  // N D H W C
82     dst[kNDHWC_N] = src[DIMENSION_0D];
83     dst[kNDHWC_D] = src[DIMENSION_1D];
84     dst[kNDHWC_H] = src[DIMENSION_2D];
85     dst[kNDHWC_W] = src[DIMENSION_3D];
86     dst[kNDHWC_C] = src[DIMENSION_4D];
87   } else if (src_num > DIMENSION_5D) {
88     MS_LOG(WARNING) << "GPU doesn't support ndim>=" << src_num;
89     return RET_ERROR;
90   }
91 
92   return RET_OK;
93 }
94 
95 template <typename SrcT, typename DstT>
Broadcast2GpuShape(const SrcT * src,int src_num,DstT * dst,int dsc_num,DstT default_value)96 int Broadcast2GpuShape(const SrcT *src, int src_num, DstT *dst, int dsc_num, DstT default_value) {
97   if (dst == nullptr || dsc_num <= 0) {
98     MS_LOG(WARNING) << "Broadcast2GpuShape invalid input";
99     return RET_ERROR;
100   }
101   for (int i = 0; i < dsc_num; ++i) {
102     dst[i] = default_value;
103   }
104   if (src == nullptr || src_num <= 0) {
105     return RET_OK;
106   }
107 
108   return Broadcast2GpuShape(src, src_num, dst, dsc_num);
109 }
110 
111 int CpuAxis2GpuAxis(size_t ndim, int cpu_axis, int *gpu_axis);
112 
113 struct GpuTensorInfo {
114   GpuTensorInfo() = default;
GpuTensorInfoGpuTensorInfo115   explicit GpuTensorInfo(const lite::Tensor *tensor) {
116     auto ocl_runtime_wrap_ = lite::opencl::OpenCLRuntimeInnerWrapper();
117     if (tensor == nullptr) {
118       return;
119     }
120     auto shape_ori = tensor->shape();
121     NDim = shape_ori.size();
122     std::vector<size_t> shape_gpu(DIMENSION_5D);
123     (void)Broadcast2GpuShape(shape_ori.data(), NDim, shape_gpu.data(), DIMENSION_5D, (size_t)1);
124     if (NDim == DIMENSION_5D) {
125       N = shape_gpu[kNDHWC_N];
126       D = shape_gpu[kNDHWC_D];
127       H = shape_gpu[kNDHWC_H];
128       W = shape_gpu[kNDHWC_W];
129       C = shape_gpu[kNDHWC_C];
130     } else {
131       N = shape_gpu[kNHWC_N];
132       H = shape_gpu[kNHWC_H];
133       W = shape_gpu[kNHWC_W];
134       C = shape_gpu[kNHWC_C];
135     }
136 
137     MS_ASSERT(N > 0);
138     MS_ASSERT(D > 0);
139     MS_ASSERT(H > 0);
140     MS_ASSERT(W > 0);
141     MS_ASSERT(C > 0);
142     Slice = UP_DIV(C, C4NUM);
143 
144     FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
145     FLT4_size = FLT_size * C4NUM;
146     if (W * Slice <= ocl_runtime_wrap_.GetInstance()->GetMaxImage2DWidth()) {
147       height = N * D * H;
148       width = W * Slice;
149     } else {
150       height = N * D * H * W;
151       width = Slice;
152       if (height > ocl_runtime_wrap_.GetInstance()->GetMaxImage2DHeight()) {
153         height = -1;
154         width = -1;
155       }
156     }
157 
158     ElementsNum = N * D * H * W * C;
159     ElementsC4Num = N * D * H * W * Slice * C4NUM;
160     OriginSize = ElementsNum * FLT_size;
161     Image2DSize = height * width * FLT4_size;
162   }
163 
CreateGpuTensorInfoGpuTensorInfo164   static std::unique_ptr<GpuTensorInfo> CreateGpuTensorInfo(const lite::Tensor *tensor) {
165     if (tensor == nullptr) {
166       MS_LOG(WARNING) << "CreateGpuTensorInfo func's input tensor is nullptr";
167       return nullptr;
168     }
169 
170     auto gpu_tensor = std::make_unique<GpuTensorInfo>();
171     auto ocl_runtime_wrap_ = lite::opencl::OpenCLRuntimeInnerWrapper();
172 
173     auto shape_ori = tensor->shape();
174     gpu_tensor->NDim = shape_ori.size();
175     std::vector<size_t> shape_gpu(DIMENSION_5D);
176     auto ret = Broadcast2GpuShape(shape_ori.data(), gpu_tensor->NDim, shape_gpu.data(), DIMENSION_5D, (size_t)1);
177     if (ret != RET_OK) {
178       MS_LOG(WARNING) << "CreateGpuTensorInfo Broadcast2GpuShape failed";
179       return nullptr;
180     }
181 
182     if (gpu_tensor->NDim == DIMENSION_5D) {
183       gpu_tensor->N = shape_gpu[kNDHWC_N];
184       gpu_tensor->D = shape_gpu[kNDHWC_D];
185       gpu_tensor->H = shape_gpu[kNDHWC_H];
186       gpu_tensor->W = shape_gpu[kNDHWC_W];
187       gpu_tensor->C = shape_gpu[kNDHWC_C];
188     } else {
189       gpu_tensor->N = shape_gpu[kNHWC_N];
190       gpu_tensor->H = shape_gpu[kNHWC_H];
191       gpu_tensor->W = shape_gpu[kNHWC_W];
192       gpu_tensor->C = shape_gpu[kNHWC_C];
193     }
194 
195     MS_ASSERT(gpu_tensor->N > 0);
196     MS_ASSERT(gpu_tensor->D > 0);
197     MS_ASSERT(gpu_tensor->H > 0);
198     MS_ASSERT(gpu_tensor->W > 0);
199     MS_ASSERT(gpu_tensor->C > 0);
200     gpu_tensor->Slice = UP_DIV(gpu_tensor->C, C4NUM);
201 
202     gpu_tensor->FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
203     gpu_tensor->FLT4_size = gpu_tensor->FLT_size * C4NUM;
204     if (gpu_tensor->W * gpu_tensor->Slice <= ocl_runtime_wrap_.GetInstance()->GetMaxImage2DWidth()) {
205       gpu_tensor->height = gpu_tensor->N * gpu_tensor->D * gpu_tensor->H;
206       gpu_tensor->width = gpu_tensor->W * gpu_tensor->Slice;
207     } else {
208       gpu_tensor->height = gpu_tensor->N * gpu_tensor->D * gpu_tensor->H * gpu_tensor->W;
209       gpu_tensor->width = gpu_tensor->Slice;
210       if (gpu_tensor->height > ocl_runtime_wrap_.GetInstance()->GetMaxImage2DHeight()) {
211         gpu_tensor->height = -1;
212         gpu_tensor->width = -1;
213       }
214     }
215 
216     gpu_tensor->ElementsNum = gpu_tensor->N * gpu_tensor->D * gpu_tensor->H * gpu_tensor->W * gpu_tensor->C;
217     gpu_tensor->ElementsC4Num =
218       gpu_tensor->N * gpu_tensor->D * gpu_tensor->H * gpu_tensor->W * gpu_tensor->Slice * C4NUM;
219     gpu_tensor->OriginSize = gpu_tensor->ElementsNum * gpu_tensor->FLT_size;
220     gpu_tensor->Image2DSize = gpu_tensor->height * gpu_tensor->width * gpu_tensor->FLT4_size;
221 
222     return gpu_tensor;
223   }
224 
RowPitchGpuTensorInfo225   size_t RowPitch() const {
226     auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
227     int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
228     MS_ASSERT(alignment);
229     size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
230     return row_pitch;
231   }
232 
AlignAxisGpuTensorInfo233   int AlignAxis(int oriAxis) const {
234     const int axis = 3;
235     if (NDim == 0 || NDim == 1) {
236       return axis;
237     }
238     int no_neg_axis = static_cast<int>((oriAxis + NDim) % NDim);
239     if (no_neg_axis == 0) {
240       return 0;
241     }
242     return static_cast<int>(no_neg_axis + C4NUM - NDim);
243   }
244 
IsImageSizeValidGpuTensorInfo245   bool IsImageSizeValid() { return width > 0 && height > 0; }
246 
247   size_t N{1};
248   size_t D{1};
249   size_t H{1};
250   size_t W{1};
251   size_t C{1};
252   size_t Slice{};
253   size_t width{};
254   size_t height{};
255   size_t FLT_size{4};
256   size_t FLT4_size{16};
257   size_t ElementsNum{};
258   size_t ElementsC4Num{};
259   size_t OriginSize{};
260   size_t Image2DSize{};
261   size_t NDim{};
262 };
263 
// One candidate tuning configuration: the local work size per dimension.
struct BaseTuningParameter {
  std::vector<size_t> local_size;

  // Streams the parameter as "LocalSize:" followed by every entry, each one
  // terminated by a comma (so a trailing comma is always present).
  friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) {
    ostrm << "LocalSize:";
    for (auto it = a.local_size.cbegin(); it != a.local_size.cend(); ++it) {
      ostrm << *it << ",";
    }
    return ostrm;
  }
};
274 class OpenCLKernel : public LiteKernel {
275  public:
OpenCLKernel(OpParameter * parameter,const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,const lite::InnerContext * ctx)276   OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
277                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
278       : LiteKernel(parameter, inputs, outputs, ctx) {
279     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
280   }
281   ~OpenCLKernel() override = default;
282   void AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
283 
Prepare()284   int Prepare() override { return RET_OK; }
285   int PreProcess() override;
286   int ReSize() override;
Run()287   int Run() override { return RET_ERROR; }
PostProcess()288   int PostProcess() override {
289     if (is_oversize_kernel_) {
290       return FreeInWorkTensor();
291     }
292     return RET_OK;
293   }
294 
295   bool MallocDataDone();
296   std::string OpenCLKernelHeader();
297 
298   virtual int CheckSpecs();
CheckSpecsWithoutShape()299   virtual int CheckSpecsWithoutShape() { return RET_OK; }
InitWeights()300   virtual int InitWeights() { return RET_OK; }
SetConstArgs()301   virtual int SetConstArgs() { return RET_OK; }
SetGlobalLocal()302   virtual int SetGlobalLocal() { return RET_OK; }
GetGlobalSize(size_t idx,std::vector<size_t> * global_size)303   virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
GetLocalSize(size_t idx,const std::vector<size_t> & global_size,std::vector<size_t> * local_size)304   virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
305     return RET_ERROR;
306   }
307   virtual std::vector<BaseTuningParameter> GenerateTuningParam();
308   virtual int AssignTuningParam(const BaseTuningParameter &param);
309   virtual int Tune();
StoreConstData()310   virtual int StoreConstData() { return RET_OK; }
DumpCode()311   virtual std::string DumpCode() { return "No source code generated!"; }
312 
313   int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
314   void PrintOutput(int print_num = 10, const std::string &out_file = "");
GetMemType()315   lite::opencl::MemType GetMemType() { return out_mem_type_; }
SetMemType(lite::opencl::MemType mem_type)316   void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
GetParameter()317   OpParameter *GetParameter() { return op_parameter_; }
318   virtual double GetProfilingTimeMs();
319   int InferShape() override;
320 
321  protected:
322   void PrintShape(lite::Tensor *output_tensor);
323   static std::set<size_t> GenerateLocalByGlobal(size_t global_i);
324 
Key()325   virtual std::string Key() {
326     std::string key = schema::EnumNamePrimitiveType(type());
327     key += "_global";
328     for (auto i : global_size_) {
329       key += "_" + std::to_string(i);
330     }
331     return key;
332   }
333 
334  protected:
335   lite::opencl::OpenCLRuntime *ocl_runtime_;
336   lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
337   cl::NDRange global_range_{cl::NullRange};
338   cl::NDRange local_range_{cl::NullRange};
339   std::vector<size_t> global_size_;
340   std::vector<size_t> local_size_;
341   cl::Kernel kernel_;
342   cl::Event event_;
343   void *restore_quant_data_{nullptr};
344   bool dequant_flag_{false};
345   bool is_oversize_kernel_{false};
346 
347  private:
348   lite::opencl::OpenCLRuntimeInnerWrapper ocl_runtime_wrap_;
349   static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_;
350 };
351 
352 template <class T>
OpenCLKernelCreator(const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,OpParameter * opParameter,const lite::InnerContext * ctx,const kernel::KernelKey & desc)353 kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
354                                         const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
355                                         const lite::InnerContext *ctx, const kernel::KernelKey &desc) {
356   auto *kernel = new (std::nothrow) T(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, ctx);
357   if (kernel == nullptr) {
358     MS_LOG(WARNING) << "kernel " << opParameter->name_ << "is nullptr.";
359     return nullptr;
360   }
361 
362   auto ret = kernel->CheckSpecsWithoutShape();
363   if (ret != mindspore::lite::RET_OK) {
364     MS_LOG(WARNING) << "Check " << opParameter->name_ << " specification Without shape failed!";
365     delete kernel;
366     return nullptr;
367   }
368 
369   auto shape = outputs.front()->shape();
370   if (std::find(shape.begin(), shape.end(), -1) != shape.end()) {
371     MS_LOG(WARNING) << "kernel " << opParameter->name_ << "don't infer shape yet!";
372     return kernel;
373   }
374   if (std::find(shape.begin(), shape.end(), 0) != shape.end()) {
375     MS_LOG(WARNING) << "kernel " << opParameter->name_ << "don't support output shape has zero.";
376     return nullptr;
377   }
378   ret = kernel->CheckSpecs();
379   if (ret != mindspore::lite::RET_OK) {
380     MS_LOG(WARNING) << "Check " << opParameter->name_ << " specification failed!";
381     delete kernel;
382     return nullptr;
383   }
384   ret = kernel->OpenCLKernel::CheckSpecs();
385   if (ret != mindspore::lite::RET_OK) {
386     MS_LOG(WARNING) << "Check " << opParameter->name_ << " specification failed!";
387     delete kernel;
388     return nullptr;
389   }
390   ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
391   if (ret != mindspore::lite::RET_OK) {
392     MS_LOG(WARNING) << "Store " << opParameter->name_ << " const data failed!";
393     delete kernel;
394     return nullptr;
395   }
396   return kernel;
397 }
398 }  // namespace mindspore::kernel
399 
400 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_OPENCL_KERNEL_H_
401