• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_
18 #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_
// 10 seconds, expressed in milliseconds. The body is parenthesized so the macro expands to a single operand:
// without the parentheses `x / MAX_PROFILING_TIME_MILLI_SECOND` would expand to `x / 10 * 1000`.
#define MAX_PROFILING_TIME_MILLI_SECOND (10 * 1000)
#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "src/inner_kernel.h"
#include "include/errorcode.h"
#include "src/runtime/gpu/opencl/opencl_runtime.h"
#include "mindspore/lite/src/weight_decoder.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "nnacl/resize_parameter.h"
30 
31 using mindspore::lite::RET_ERROR;
32 using mindspore::lite::RET_OK;
33 
34 namespace mindspore::kernel {
// Named constants for the integers 1..6 (inputs) and 1..4 (outputs).
// NOTE(review): presumably compared against the number of input/output tensors in kernel
// specification checks -- confirm against the callers, which are not visible in this header.
constexpr int INPUT_TENSOR_SIZE_1 = 1;
constexpr int INPUT_TENSOR_SIZE_2 = 2;
constexpr int INPUT_TENSOR_SIZE_3 = 3;
constexpr int INPUT_TENSOR_SIZE_4 = 4;
constexpr int INPUT_TENSOR_SIZE_5 = 5;
constexpr int INPUT_TENSOR_SIZE_6 = 6;
constexpr int OUTPUT_TENSOR_SIZE_1 = 1;
constexpr int OUTPUT_TENSOR_SIZE_2 = 2;
constexpr int OUTPUT_TENSOR_SIZE_3 = 3;
constexpr int OUTPUT_TENSOR_SIZE_4 = 4;
45 
46 struct OpenCLToFormatParameter {
47   OpParameter op_parameter{};
48   lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG};
49 };
50 
51 template <typename SrcT, typename DstT>
Broadcast2GpuShape(DstT * dst,const SrcT * src,int src_num)52 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
53   MS_ASSERT(dst);
54   if (src == nullptr || src_num <= 0) {
55     return;
56   }
57   auto *N = dst;
58   auto *H = dst + 1;
59   auto *W = dst + 2;
60   auto *C = dst + 3;
61   if (src_num == 1) {  // 1 1 1 C
62     *C = src[0];
63   } else if (src_num == 2) {  // N 1 1 C
64     *N = src[0];
65     *C = src[1];
66   } else if (src_num == 3) {  // N 1 W C
67     *N = src[0];
68     *W = src[1];
69     *C = src[2];
70   } else if (src_num == 4) {  // N H W C
71     *N = src[0];
72     *H = src[1];
73     *W = src[2];
74     *C = src[3];
75   } else if (src_num > 4) {
76     MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
77   }
78 }
79 
/**
 * Fill a 4-element N/H/W/C array with `default_value`, then overlay the given shape on top of it.
 *
 * All four slots of `dst` are always set to `default_value` first. The overlay is delegated to the 3-argument
 * overload, which writes nothing when `src` is null or `src_num` <= 0, so in that case `dst` keeps the defaults.
 *
 * @param dst            Output array with room for at least 4 elements.
 * @param src            Source shape (may be null).
 * @param src_num        Number of valid entries in `src`.
 * @param default_value  Value stored in every slot that the shape does not cover.
 */
template <typename SrcT, typename DstT>
void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
  MS_ASSERT(dst);
  // Guard explicitly as well: MS_ASSERT may expand to nothing in some build configurations.
  if (dst == nullptr) {
    return;
  }
  constexpr int kGpuShapeDims = 4;  // N, H, W, C
  std::fill_n(dst, kGpuShapeDims, default_value);
  Broadcast2GpuShape(dst, src, src_num);
}
91 
92 struct GpuTensorInfo {
93   GpuTensorInfo() = default;
GpuTensorInfoGpuTensorInfo94   explicit GpuTensorInfo(const lite::Tensor *tensor) {
95     auto ocl_runtime_wrap_ = lite::opencl::OpenCLRuntimeInnerWrapper();
96     if (tensor == nullptr) {
97       return;
98     }
99     auto shape_ori = tensor->shape();
100     NDim = shape_ori.size();
101     cl_int4 shape;
102     Broadcast2GpuShape(shape.s, shape_ori.data(), shape_ori.size(), 1);
103     N = shape.s[0];
104     H = shape.s[1];
105     W = shape.s[2];
106     C = shape.s[3];
107     MS_ASSERT(N > 0);
108     MS_ASSERT(H > 0);
109     MS_ASSERT(W > 0);
110     MS_ASSERT(C > 0);
111     Slice = UP_DIV(C, C4NUM);
112 
113     FLT_size = tensor->data_type() == kNumberTypeFloat16 ? sizeof(cl_half) : sizeof(cl_float);
114     FLT4_size = FLT_size * 4;
115     if (W * Slice <= ocl_runtime_wrap_.GetInstance()->GetMaxImage2DWidth()) {
116       height = N * H;
117       width = W * Slice;
118     } else {
119       height = N * H * W;
120       width = Slice;
121       if (height > ocl_runtime_wrap_.GetInstance()->GetMaxImage2DHeight()) {
122         height = -1;
123         width = -1;
124       }
125     }
126 
127     ElementsNum = N * H * W * C;
128     ElementsC4Num = N * H * W * Slice * C4NUM;
129     OriginSize = ElementsNum * FLT_size;
130     Image2DSize = height * width * FLT4_size;
131   }
132 
RowPitchGpuTensorInfo133   size_t RowPitch() const {
134     auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
135     int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
136     MS_ASSERT(alignment);
137     size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
138     return row_pitch;
139   }
140 
AlignAxisGpuTensorInfo141   int AlignAxis(int oriAxis) const {
142     if (NDim == 0 || NDim == 1) {
143       return 3;
144     }
145     int no_neg_axis = static_cast<int>((oriAxis + NDim) % NDim);
146     if (no_neg_axis == 0) {
147       return 0;
148     }
149     return static_cast<int>(no_neg_axis + 4 - NDim);
150   }
151 
IsImageSizeValidGpuTensorInfo152   bool IsImageSizeValid() { return width > 0 && height > 0; }
153 
154   size_t N{1};
155   size_t H{1};
156   size_t W{1};
157   size_t C{1};
158   size_t Slice{};
159   size_t width{};
160   size_t height{};
161   size_t FLT_size{4};
162   size_t FLT4_size{16};
163   size_t ElementsNum{};
164   size_t ElementsC4Num{};
165   size_t OriginSize{};
166   size_t Image2DSize{};
167   size_t NDim{};
168 };
169 
/**
 * Tuning candidate holding a list of local sizes.
 *
 * Streaming it writes "LocalSize:" followed by every entry, each terminated by a comma
 * (for example "LocalSize:4,8,16,").
 */
struct BaseTuningParameter {
  std::vector<size_t> local_size;

  friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) {
    ostrm << "LocalSize:";
    for (const size_t dim : a.local_size) {
      ostrm << dim << ",";
    }
    return ostrm;
  }
};
180 class OpenCLKernel : public InnerKernel {
181  public:
OpenCLKernel(OpParameter * parameter,const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,const lite::InnerContext * ctx)182   OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
183                const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx)
184       : InnerKernel(parameter, inputs, outputs, ctx) {
185     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
186   }
187   ~OpenCLKernel() override = default;
188   void AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local);
189 
Prepare()190   int Prepare() override { return RET_OK; }
191   int PreProcess() override;
192   int ReSize() override;
Run()193   int Run() override { return RET_ERROR; }
194 
195   virtual int CheckSpecs();
InitWeights()196   virtual int InitWeights() { return RET_OK; }
SetConstArgs()197   virtual int SetConstArgs() { return RET_OK; }
SetGlobalLocal()198   virtual void SetGlobalLocal() {}
GetGlobalSize(size_t idx,std::vector<size_t> * global_size)199   virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; }
GetLocalSize(size_t idx,const std::vector<size_t> & global_size,std::vector<size_t> * local_size)200   virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) {
201     return RET_ERROR;
202   }
203   virtual std::vector<BaseTuningParameter> GenerateTuningParam();
204   virtual int AssignTuningParam(const BaseTuningParameter &param);
205   virtual int Tune();
StoreConstData()206   virtual int StoreConstData() { return RET_OK; }
207 
208   int GetImageSize(size_t idx, lite::opencl::ImageSize *img_size);
209   void PrintOutput(int print_num = 10, const std::string &out_file = "");
GetMemType()210   lite::opencl::MemType GetMemType() { return out_mem_type_; }
SetMemType(lite::opencl::MemType mem_type)211   void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
GetParameter()212   OpParameter *GetParameter() { return op_parameter_; }
213   virtual double GetProfilingTimeMs();
214   virtual int InferShape();
215 
216  protected:
217   static std::set<size_t> GenerateLocalByGlobal(size_t global_i);
218 
Key()219   virtual std::string Key() {
220     std::string key = schema::EnumNamePrimitiveType(type());
221     key += "_global";
222     for (auto i : global_size_) {
223       key += "_" + std::to_string(i);
224     }
225     return key;
226   }
227 
228  protected:
229   lite::opencl::OpenCLRuntime *ocl_runtime_;
230   lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
231   cl::NDRange global_range_{cl::NullRange};
232   cl::NDRange local_range_{cl::NullRange};
233   std::vector<size_t> global_size_;
234   std::vector<size_t> local_size_;
235   cl::Kernel kernel_;
236   cl::Event event_;
237   void *restore_quant_data_{nullptr};
238   bool dequant_flag_{false};
239 
240  private:
241   lite::opencl::OpenCLRuntimeInnerWrapper ocl_runtime_wrap_;
242   static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_;
243 };
244 template <class T>
OpenCLKernelCreator(const std::vector<lite::Tensor * > & inputs,const std::vector<lite::Tensor * > & outputs,OpParameter * opParameter,const lite::Context * ctx,const kernel::KernelKey & desc)245 kernel::InnerKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,
246                                          const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
247                                          const lite::Context *ctx, const kernel::KernelKey &desc) {
248   auto *kernel = new (std::nothrow)
249     T(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, static_cast<const lite::InnerContext *>(ctx));
250   if (kernel == nullptr) {
251     MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr.";
252     free(opParameter);
253     return nullptr;
254   }
255   auto shape = outputs.front()->shape();
256   if (std::find(shape.begin(), shape.end(), -1) != shape.end()) {
257     MS_LOG(WARNING) << "kernel " << opParameter->name_ << "don't infer shape yet!";
258     return kernel;
259   }
260   if (std::find(shape.begin(), shape.end(), 0) != shape.end()) {
261     MS_LOG(WARNING) << "kernel " << opParameter->name_ << "don't support output shape has zero.";
262     return nullptr;
263   }
264   auto ret = kernel->CheckSpecs();
265   if (ret != mindspore::lite::RET_OK) {
266     MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
267     delete kernel;
268     return nullptr;
269   }
270   ret = kernel->OpenCLKernel::CheckSpecs();
271   if (ret != mindspore::lite::RET_OK) {
272     MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
273     delete kernel;
274     return nullptr;
275   }
276   ret = reinterpret_cast<OpenCLKernel *>(kernel)->StoreConstData();
277   if (ret != mindspore::lite::RET_OK) {
278     MS_LOG(ERROR) << "Store " << opParameter->name_ << " const data failed!";
279     delete kernel;
280     return nullptr;
281   }
282   return kernel;
283 }
284 }  // namespace mindspore::kernel
285 
286 #endif  // MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_
287