• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/litert/kernel/opencl/utils.h"
18 #include <fstream>
19 #include <algorithm>
20 #include <vector>
21 #include <map>
22 #include "src/litert/kernel_registry.h"
23 #include "src/common/file_utils.h"
24 
25 using mindspore::schema::ActivationType_LEAKY_RELU;
26 using mindspore::schema::ActivationType_RELU;
27 using mindspore::schema::ActivationType_RELU6;
28 using mindspore::schema::ActivationType_SIGMOID;
29 using mindspore::schema::ActivationType_TANH;
30 
31 namespace mindspore::kernel {
// Two-input element-wise primitive types handled by the common OpenCL
// arithmetic kernel (grouping inferred from the name; membership is the
// authoritative contract here).
const std::set<schema::PrimitiveType> ArithmeticPrimitives = {schema::PrimitiveType_MulFusion,
                                                              schema::PrimitiveType_AddFusion,
                                                              schema::PrimitiveType_SubFusion,
                                                              schema::PrimitiveType_DivFusion,
                                                              schema::PrimitiveType_LogicalAnd,
                                                              schema::PrimitiveType_LogicalOr,
                                                              schema::PrimitiveType_Maximum,
                                                              schema::PrimitiveType_Minimum,
                                                              schema::PrimitiveType_FloorDiv,
                                                              schema::PrimitiveType_FloorMod,
                                                              schema::PrimitiveType_SquaredDifference,
                                                              schema::PrimitiveType_Equal,
                                                              schema::PrimitiveType_NotEqual,
                                                              schema::PrimitiveType_Less,
                                                              schema::PrimitiveType_LessEqual,
                                                              schema::PrimitiveType_Greater,
                                                              schema::PrimitiveType_GreaterEqual,
                                                              schema::PrimitiveType_Eltwise,
                                                              schema::PrimitiveType_BiasAdd};
51 
// Single-input (unary) element-wise primitive types, grouped for the OpenCL
// "arithmetic self" kernel (grouping inferred from the name).
const std::set<schema::PrimitiveType> ArithmeticSelfPrimitives = {
  schema::PrimitiveType_Abs,        schema::PrimitiveType_Ceil,  schema::PrimitiveType_Cos,
  schema::PrimitiveType_ExpFusion,  schema::PrimitiveType_Floor, schema::PrimitiveType_Log,
  schema::PrimitiveType_LogicalNot, schema::PrimitiveType_Round, schema::PrimitiveType_Rsqrt,
  schema::PrimitiveType_Sin,        schema::PrimitiveType_Neg,   schema::PrimitiveType_Sqrt,
  schema::PrimitiveType_Square};
58 
GetActDefines()59 std::string GetActDefines() {
60   static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) +
61                                    "\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) +
62                                    "\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) +
63                                    "\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) +
64                                    "\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n";
65   return act_defines;
66 }
67 
// Rounds n up to the nearest power of two.  Returns n itself when n is
// already a power of two, and 1 when n <= 0.
int GetUpPow2(int n) {
  int pow2 = 1;
  while (pow2 < n) {
    pow2 <<= 1;
  }
  return pow2;
}
78 
// Returns the largest value in [1, divisor] that divides x evenly.
// Falls back to 1 when divisor <= 0 or nothing in range divides x.
int GetMaxDivisor(int x, int divisor) {
  for (int candidate = divisor; candidate > 0; --candidate) {
    if (x % candidate == 0) {
      return candidate;
    }
  }
  return 1;
}
89 
GetMaxDivisorStrategy0(int x,int divisor)90 int GetMaxDivisorStrategy0(int x, int divisor) {
91   if (divisor >= C8NUM && x % C8NUM == 0) {
92     return C8NUM;
93   } else if (divisor >= C4NUM && x % C4NUM == 0) {
94     return C4NUM;
95   } else if (divisor >= C2NUM && x % C2NUM == 0) {
96     return C2NUM;
97   } else {
98     return GetMaxDivisor(x, divisor);
99   }
100 }
101 
GetMaxDivisorStrategy1(int x,int divisor)102 int GetMaxDivisorStrategy1(int x, int divisor) {
103   if (divisor >= C8NUM && x % C8NUM == 0) {
104     return x / C8NUM;
105   } else if (divisor >= C4NUM && x % C4NUM == 0) {
106     return x / C4NUM;
107   } else if (divisor >= C2NUM && x % C2NUM == 0) {
108     return x / C2NUM;
109   } else {
110     return GetMaxDivisor(x, divisor);
111   }
112 }
113 
// Maps OpenCL status codes (cl_int, the CL_* constants) to human-readable
// descriptions; consumed by CLErrorCode().  Includes the
// CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR extension code.
std::map<cl_int, std::string> error_infos = {
  {CL_SUCCESS, "Success"},
  {CL_DEVICE_NOT_FOUND, "Device not found"},
  {CL_DEVICE_NOT_AVAILABLE, "Device not available"},
  {CL_COMPILER_NOT_AVAILABLE, "Compiler not available"},
  {CL_MEM_OBJECT_ALLOCATION_FAILURE, "Memory object allocation failure"},
  {CL_OUT_OF_RESOURCES, "Out of resources"},
  {CL_OUT_OF_HOST_MEMORY, "Out of host memory"},
  {CL_PROFILING_INFO_NOT_AVAILABLE, "Profiling information not available"},
  {CL_MEM_COPY_OVERLAP, "Memory copy overlap"},
  {CL_IMAGE_FORMAT_MISMATCH, "Image format mismatch"},
  {CL_IMAGE_FORMAT_NOT_SUPPORTED, "Image format not supported"},
  {CL_BUILD_PROGRAM_FAILURE, "Build program failure"},
  {CL_MAP_FAILURE, "Mapping failure"},
  {CL_MISALIGNED_SUB_BUFFER_OFFSET, "Misaligned sub-buffer offset"},
  {CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, "Execution status error for events in wait list"},
  {CL_COMPILE_PROGRAM_FAILURE, "Compile program failure"},
  {CL_LINKER_NOT_AVAILABLE, "Linker not available"},
  {CL_LINK_PROGRAM_FAILURE, "Link program failure"},
  {CL_DEVICE_PARTITION_FAILED, "Device partition failed"},
  {CL_KERNEL_ARG_INFO_NOT_AVAILABLE, "Kernel argument information not available"},
  {CL_INVALID_VALUE, "Invalid value"},
  {CL_INVALID_DEVICE_TYPE, "Invalid device type"},
  {CL_INVALID_PLATFORM, "Invalid platform"},
  {CL_INVALID_DEVICE, "Invalid device"},
  {CL_INVALID_CONTEXT, "Invalid context"},
  {CL_INVALID_QUEUE_PROPERTIES, "Invalid queue properties"},
  {CL_INVALID_COMMAND_QUEUE, "Invalid command queue"},
  {CL_INVALID_HOST_PTR, "Invalid host pointer"},
  {CL_INVALID_MEM_OBJECT, "Invalid memory object"},
  {CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "Invalid image format descriptor"},
  {CL_INVALID_IMAGE_SIZE, "Invalid image size"},
  {CL_INVALID_SAMPLER, "Invalid sampler"},
  {CL_INVALID_BINARY, "Invalid binary"},
  {CL_INVALID_BUILD_OPTIONS, "Invalid build options"},
  {CL_INVALID_PROGRAM, "Invalid program"},
  {CL_INVALID_PROGRAM_EXECUTABLE, "Invalid program executable"},
  {CL_INVALID_KERNEL_NAME, "Invalid kernel name"},
  {CL_INVALID_KERNEL_DEFINITION, "Invalid kernel definition"},
  {CL_INVALID_KERNEL, "Invalid kernel"},
  {CL_INVALID_ARG_INDEX, "Invalid argument index"},
  {CL_INVALID_ARG_VALUE, "Invalid argument value"},
  {CL_INVALID_ARG_SIZE, "Invalid argument size"},
  {CL_INVALID_KERNEL_ARGS, "Invalid kernel arguments"},
  {CL_INVALID_WORK_DIMENSION, "Invalid work dimension"},
  {CL_INVALID_WORK_GROUP_SIZE, "Invalid work group size"},
  {CL_INVALID_WORK_ITEM_SIZE, "Invalid work item size"},
  {CL_INVALID_GLOBAL_OFFSET, "Invalid global offset"},
  {CL_INVALID_EVENT_WAIT_LIST, "Invalid event wait list"},
  {CL_INVALID_EVENT, "Invalid event"},
  {CL_INVALID_OPERATION, "Invalid operation"},
  {CL_INVALID_GL_OBJECT, "Invalid GL object"},
  {CL_INVALID_BUFFER_SIZE, "Invalid buffer size"},
  {CL_INVALID_MIP_LEVEL, "Invalid mip-level"},
  {CL_INVALID_GLOBAL_WORK_SIZE, "Invalid global work size"},
  {CL_INVALID_PROPERTY, "Invalid property"},
  {CL_INVALID_IMAGE_DESCRIPTOR, "Invalid image descriptor"},
  {CL_INVALID_COMPILER_OPTIONS, "Invalid compiler options"},
  {CL_INVALID_LINKER_OPTIONS, "Invalid linker options"},
  {CL_INVALID_DEVICE_PARTITION_COUNT, "Invalid device partition count"},
  {CL_INVALID_PIPE_SIZE, "Invalid pipe size"},
  {CL_INVALID_DEVICE_QUEUE, "Invalid device queue"},
  {CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR, "Invalid GL share group reference KHR"}};
177 
CLErrorCode(cl_int error_code)178 std::string CLErrorCode(cl_int error_code) {
179   auto it = error_infos.find(error_code);
180   if (it == error_infos.end()) {
181     return "Unknown OpenCL error code";
182   } else {
183     return it->second;
184   }
185 }
186 
GetBroadcastGpuAxis(int ndim,int ori_axis)187 int GetBroadcastGpuAxis(int ndim, int ori_axis) {
188   if (ori_axis >= ndim) {
189     return ndim - 1;
190   }
191   int axis = 0;
192   if (ndim == DIMENSION_1D) {
193     axis = kNHWC_C;
194   } else if (ndim == DIMENSION_2D) {
195     axis = ori_axis == kNHWC_N ? kNHWC_N : kNHWC_C;
196   } else if (ndim == DIMENSION_3D) {
197     axis = ori_axis == kNHWC_N ? kNHWC_N : ori_axis == kNHWC_H ? kNHWC_W : kNHWC_C;
198   } else if (ndim == DIMENSION_4D) {
199     axis = ori_axis;
200   } else if (ndim > DIMENSION_4D) {
201     MS_LOG(ERROR) << "GPU doesn't support ndim>=" << ndim;
202   }
203   return axis;
204 }
205 
206 #ifdef ENABLE_FP16
// Packs a dense NHWC (D folded into H for 5D) buffer into "NHWC4" layout:
// channel-last with the channel dimension padded to tensor.Slice * C4NUM
// elements per (n, h, w) position.  int32 data is copied verbatim; floating
// data is converted between fp16 and fp32 per src_is_fp16 / dst_is_fp16.
// NOTE(review): padding slots beyond tensor.C are not written here —
// presumably the caller zero-initializes dst; confirm.
void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  // View both buffers under every supported element type; only the pair
  // selected by data_type / the fp16 flags is actually dereferenced.
  auto src_fp16 = reinterpret_cast<float16_t *>(src);
  auto src_fp32 = reinterpret_cast<float32_t *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
  auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  // src is dense, so src_idx just increments; dst_idx skips the channel
  // padding implied by Slice * C4NUM.
  for (int n = 0, src_idx = 0; n < tensor.N; n++) {
    for (int h = 0; h < tensor.D * tensor.H; ++h) {  // D and H iterated together
      for (int w = 0; w < tensor.W; ++w) {
        for (int c = 0; c < tensor.C; ++c, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else if (dst_is_fp16) {
            dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
          } else {
            dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar: broadcast the single value (stored at slot kNHWC_N) into the
  // kNHWC_H/kNHWC_W/kNHWC_C slots of the first vec4.
  if (tensor.ElementsNum == 1) {
    if (dst_is_fp16) {
      dst_fp16[kNHWC_C] = dst_fp16[kNHWC_W] = dst_fp16[kNHWC_H] = dst_fp16[kNHWC_N];
    } else {
      dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
    }
  }
}
242 #else
// Non-FP16 build of PackNHWCToNHWC4: packs a dense NHWC buffer into the
// channel-padded NHWC4 layout (Slice * C4NUM channel slots per pixel).
// Only int32 and fp32 payloads are handled; the src_is_fp16 / dst_is_fp16
// flags are accepted for interface parity but ignored here.
void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(dst);
  MS_ASSERT(src);
  auto src_fp32 = reinterpret_cast<float *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp32 = reinterpret_cast<float *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  // src is dense, so src_idx just increments; dst_idx skips channel padding.
  for (size_t n = 0, src_idx = 0; n < tensor.N; n++) {
    for (size_t h = 0; h < tensor.D * tensor.H; ++h) {  // D and H iterated together
      for (size_t w = 0; w < tensor.W; ++w) {
        for (size_t c = 0; c < tensor.C; ++c, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else {
            dst_fp32[dst_idx] = src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar: broadcast the single value (at slot kNHWC_N) into the other
  // three channel slots of the first vec4.
  if (tensor.ElementsNum == 1) {
    dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
  }
}
270 #endif
271 
272 #ifdef ENABLE_FP16
PackNCHWToNHWC4(void * src,void * dst,bool src_is_fp16,bool dst_is_fp16,const GpuTensorInfo & tensor,int data_type)273 void PackNCHWToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
274                      int data_type) {
275   MS_ASSERT(src);
276   MS_ASSERT(dst);
277   auto src_int32 = reinterpret_cast<int32_t *>(src);
278   auto src_fp32 = reinterpret_cast<float32_t *>(src);
279   auto src_fp16 = reinterpret_cast<float16_t *>(src);
280   auto dst_int32 = reinterpret_cast<int32_t *>(dst);
281   auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
282   auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
283   for (int src_idx = 0, n = 0; n < tensor.N; n++) {
284     for (int c = 0; c < tensor.C; ++c) {
285       for (int h = 0; h < tensor.D * tensor.H; ++h) {
286         for (int w = 0; w < tensor.W; ++w, ++src_idx) {
287           int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
288           if (data_type == kNumberTypeInt32) {
289             dst_int32[dst_idx] = src_int32[src_idx];
290           } else if (dst_is_fp16) {
291             dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
292           } else {
293             dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
294           }
295         }
296       }
297     }
298   }
299   // scalar
300   if (tensor.ElementsNum == 1) {
301     if (dst_is_fp16) {
302       dst_fp16[kNHWC_N] = dst_fp16[kNHWC_H] = dst_fp16[kNHWC_W] = dst_fp16[kNHWC_C];
303     } else {
304       dst_fp32[kNHWC_N] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_C];
305     }
306   }
307 }
308 #else
// Non-FP16 build of PackNCHWToNHWC4: transposes a dense NCHW buffer into the
// channel-padded NHWC4 layout (Slice * C4NUM channel slots per pixel).
// Only int32 and fp32 payloads are handled; the src_is_fp16 / dst_is_fp16
// flags are accepted for interface parity but ignored here.
void PackNCHWToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  auto src_fp32 = reinterpret_cast<float *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp32 = reinterpret_cast<float *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  // src is dense NCHW, so src_idx just increments with the c-outer loop
  // order; dst_idx places each value at its channel-last, padded position.
  for (size_t n = 0, src_idx = 0; n < tensor.N; n++) {
    for (size_t c = 0; c < tensor.C; ++c) {
      for (size_t h = 0; h < tensor.D * tensor.H; ++h) {  // D and H iterated together
        for (size_t w = 0; w < tensor.W; ++w, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else {
            dst_fp32[dst_idx] = src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar: broadcast the single value (at slot kNHWC_N) into the other
  // three channel slots of the first vec4.
  if (tensor.ElementsNum == 1) {
    dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
  }
}
336 #endif
337 
CheckParamLikeTensor(const std::string & kernel_name,const std::string & tensor_name,lite::Tensor * tensor,TypeId expect_data_type,const std::vector<int> & expect_shape)338 int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
339                          TypeId expect_data_type, const std::vector<int> &expect_shape) {
340   if (!tensor->IsConst()) {
341     MS_LOG(WARNING) << "in " << kernel_name << ": tensor " << tensor_name << " must be Const.";
342     return RET_ERROR;
343   }
344   if (tensor->data_type() != expect_data_type) {
345     MS_LOG(WARNING) << "in " << kernel_name << ": tensor's data_type must be " << expect_data_type;
346     return RET_ERROR;
347   }
348   if (tensor->shape() != expect_shape) {
349     std::string expect_shape_str = "(";
350     for (auto i : expect_shape) {
351       expect_shape_str += std::to_string(i) + ",";
352     }
353     expect_shape_str += ")";
354 
355     std::string tensor_shape_str = "(";
356     for (auto i : tensor->shape()) {
357       tensor_shape_str += std::to_string(i) + ",";
358     }
359     tensor_shape_str += ")";
360 
361     MS_LOG(WARNING) << "in " << kernel_name
362                     << ": tensor's shape is error. expect_shape: " + expect_shape_str +
363                          " tensor->shape(): " + tensor_shape_str;
364     return RET_ERROR;
365   }
366   return RET_OK;
367 }
368 
StoreTensorData(lite::Tensor * tensor)369 void *StoreTensorData(lite::Tensor *tensor) {
370   if ((tensor != nullptr) && (tensor->data() != nullptr) && (tensor->Size() > 0)) {
371     void *stored_data = malloc(tensor->Size());
372     if (stored_data == nullptr) {
373       MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
374       return nullptr;
375     }
376     memcpy(stored_data, tensor->data(), tensor->Size());
377     return stored_data;
378   }
379   return nullptr;
380 }
381 
// Releases a buffer previously returned by StoreTensorData; null is a no-op.
void FreeStoredData(void *data) {
  if (data == nullptr) {
    return;
  }
  free(data);
}
387 
CreateBuildOptionsExtByDType(TypeId type_id)388 std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id) {
389   std::vector<std::string> build_options_ext;
390   if (type_id == kNumberTypeInt32) {
391     build_options_ext = {" -DDTYPE=int -DDTYPE4=int4 -DWRITE_IMAGE=write_imagei  -DREAD_IMAGE=read_imagei "};
392   } else if (type_id == kNumberTypeFloat32) {
393     build_options_ext = {" -DDTYPE=float -DDTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
394   } else if (type_id == kNumberTypeFloat16) {
395     build_options_ext = {" -DDTYPE=half -DDTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
396   }
397   return build_options_ext;
398 }
399 }  // namespace mindspore::kernel
400