/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/litert/kernel/opencl/utils.h"
#include <fstream>
#include <algorithm>
#include <vector>
#include <map>
#include "src/litert/kernel_registry.h"
#include "src/common/file_utils.h"

using mindspore::schema::ActivationType_LEAKY_RELU;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::ActivationType_SIGMOID;
using mindspore::schema::ActivationType_TANH;

namespace mindspore::kernel {
const std::set<schema::PrimitiveType> ArithmeticPrimitives = {schema::PrimitiveType_MulFusion,
                                                              schema::PrimitiveType_AddFusion,
                                                              schema::PrimitiveType_SubFusion,
                                                              schema::PrimitiveType_DivFusion,
                                                              schema::PrimitiveType_LogicalAnd,
                                                              schema::PrimitiveType_LogicalOr,
                                                              schema::PrimitiveType_Maximum,
                                                              schema::PrimitiveType_Minimum,
                                                              schema::PrimitiveType_FloorDiv,
                                                              schema::PrimitiveType_FloorMod,
                                                              schema::PrimitiveType_SquaredDifference,
                                                              schema::PrimitiveType_Equal,
                                                              schema::PrimitiveType_NotEqual,
                                                              schema::PrimitiveType_Less,
                                                              schema::PrimitiveType_LessEqual,
                                                              schema::PrimitiveType_Greater,
                                                              schema::PrimitiveType_GreaterEqual,
                                                              schema::PrimitiveType_Eltwise,
                                                              schema::PrimitiveType_BiasAdd};

const std::set<schema::PrimitiveType> ArithmeticSelfPrimitives = {
  schema::PrimitiveType_Abs,        schema::PrimitiveType_Ceil,  schema::PrimitiveType_Cos,
  schema::PrimitiveType_ExpFusion,  schema::PrimitiveType_Floor, schema::PrimitiveType_Log,
  schema::PrimitiveType_LogicalNot, schema::PrimitiveType_Round, schema::PrimitiveType_Rsqrt,
  schema::PrimitiveType_Sin,        schema::PrimitiveType_Neg,   schema::PrimitiveType_Sqrt,
  schema::PrimitiveType_Square};

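// Builds the "#define ActivationType_*" lines that are prepended to OpenCL kernel source,
// so device code can compare against the schema activation enum values.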
std::string GetActDefines() {
  static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) +
                                   "\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) +
                                   "\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) +
                                   "\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) +
                                   "\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n";
  return act_defines;
}

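// Rounds n up to the nearest power of two (n is returned unchanged if it is already a power of two).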
int GetUpPow2(int n) {
  int i = 0;
  int j = 0;
  while (n > 0) {
    j += n & 1;
    n = n >> 1;
    i++;
  }
  return 1 << (i - (j == 1));
}

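// Returns the largest divisor of x that does not exceed `divisor` (at least 1).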
int GetMaxDivisor(int x, int divisor) {
  int i = divisor;
  while (i > 0) {
    if (x % i == 0) {
      return i;
    }
    i--;
  }
  return 1;
}

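// Prefers the divisors 8, 4 or 2 when they divide x evenly and fit within `divisor`;
// otherwise falls back to a linear search via GetMaxDivisor.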
int GetMaxDivisorStrategy0(int x, int divisor) {
  if (divisor >= C8NUM && x % C8NUM == 0) {
    return C8NUM;
  } else if (divisor >= C4NUM && x % C4NUM == 0) {
    return C4NUM;
  } else if (divisor >= C2NUM && x % C2NUM == 0) {
    return C2NUM;
  } else {
    return GetMaxDivisor(x, divisor);
  }
}

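// Like strategy 0, but returns the quotient x / {8, 4, 2} when divisible;
// otherwise falls back to GetMaxDivisor.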
int GetMaxDivisorStrategy1(int x, int divisor) {
  if (divisor >= C8NUM && x % C8NUM == 0) {
    return x / C8NUM;
  } else if (divisor >= C4NUM && x % C4NUM == 0) {
    return x / C4NUM;
  } else if (divisor >= C2NUM && x % C2NUM == 0) {
    return x / C2NUM;
  } else {
    return GetMaxDivisor(x, divisor);
  }
}

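// Human-readable descriptions for OpenCL error codes, used by CLErrorCode().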
std::map<cl_int, std::string> error_infos = {
  {CL_SUCCESS, "Success"},
  {CL_DEVICE_NOT_FOUND, "Device not found"},
  {CL_DEVICE_NOT_AVAILABLE, "Device not available"},
  {CL_COMPILER_NOT_AVAILABLE, "Compiler not available"},
  {CL_MEM_OBJECT_ALLOCATION_FAILURE, "Memory object allocation failure"},
  {CL_OUT_OF_RESOURCES, "Out of resources"},
  {CL_OUT_OF_HOST_MEMORY, "Out of host memory"},
  {CL_PROFILING_INFO_NOT_AVAILABLE, "Profiling information not available"},
  {CL_MEM_COPY_OVERLAP, "Memory copy overlap"},
  {CL_IMAGE_FORMAT_MISMATCH, "Image format mismatch"},
  {CL_IMAGE_FORMAT_NOT_SUPPORTED, "Image format not supported"},
  {CL_BUILD_PROGRAM_FAILURE, "Build program failure"},
  {CL_MAP_FAILURE, "Mapping failure"},
  {CL_MISALIGNED_SUB_BUFFER_OFFSET, "Misaligned sub-buffer offset"},
  {CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, "Execution status error for events in wait list"},
  {CL_COMPILE_PROGRAM_FAILURE, "Compile program failure"},
  {CL_LINKER_NOT_AVAILABLE, "Linker not available"},
  {CL_LINK_PROGRAM_FAILURE, "Link program failure"},
  {CL_DEVICE_PARTITION_FAILED, "Device partition failed"},
  {CL_KERNEL_ARG_INFO_NOT_AVAILABLE, "Kernel argument information not available"},
  {CL_INVALID_VALUE, "Invalid value"},
  {CL_INVALID_DEVICE_TYPE, "Invalid device type"},
  {CL_INVALID_PLATFORM, "Invalid platform"},
  {CL_INVALID_DEVICE, "Invalid device"},
  {CL_INVALID_CONTEXT, "Invalid context"},
  {CL_INVALID_QUEUE_PROPERTIES, "Invalid queue properties"},
  {CL_INVALID_COMMAND_QUEUE, "Invalid command queue"},
  {CL_INVALID_HOST_PTR, "Invalid host pointer"},
  {CL_INVALID_MEM_OBJECT, "Invalid memory object"},
  {CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "Invalid image format descriptor"},
  {CL_INVALID_IMAGE_SIZE, "Invalid image size"},
  {CL_INVALID_SAMPLER, "Invalid sampler"},
  {CL_INVALID_BINARY, "Invalid binary"},
  {CL_INVALID_BUILD_OPTIONS, "Invalid build options"},
  {CL_INVALID_PROGRAM, "Invalid program"},
  {CL_INVALID_PROGRAM_EXECUTABLE, "Invalid program executable"},
  {CL_INVALID_KERNEL_NAME, "Invalid kernel name"},
  {CL_INVALID_KERNEL_DEFINITION, "Invalid kernel definition"},
  {CL_INVALID_KERNEL, "Invalid kernel"},
  {CL_INVALID_ARG_INDEX, "Invalid argument index"},
  {CL_INVALID_ARG_VALUE, "Invalid argument value"},
  {CL_INVALID_ARG_SIZE, "Invalid argument size"},
  {CL_INVALID_KERNEL_ARGS, "Invalid kernel arguments"},
  {CL_INVALID_WORK_DIMENSION, "Invalid work dimension"},
  {CL_INVALID_WORK_GROUP_SIZE, "Invalid work group size"},
  {CL_INVALID_WORK_ITEM_SIZE, "Invalid work item size"},
  {CL_INVALID_GLOBAL_OFFSET, "Invalid global offset"},
  {CL_INVALID_EVENT_WAIT_LIST, "Invalid event wait list"},
  {CL_INVALID_EVENT, "Invalid event"},
  {CL_INVALID_OPERATION, "Invalid operation"},
  {CL_INVALID_GL_OBJECT, "Invalid GL object"},
  {CL_INVALID_BUFFER_SIZE, "Invalid buffer size"},
  {CL_INVALID_MIP_LEVEL, "Invalid mip-level"},
  {CL_INVALID_GLOBAL_WORK_SIZE, "Invalid global work size"},
  {CL_INVALID_PROPERTY, "Invalid property"},
  {CL_INVALID_IMAGE_DESCRIPTOR, "Invalid image descriptor"},
  {CL_INVALID_COMPILER_OPTIONS, "Invalid compiler options"},
  {CL_INVALID_LINKER_OPTIONS, "Invalid linker options"},
  {CL_INVALID_DEVICE_PARTITION_COUNT, "Invalid device partition count"},
  {CL_INVALID_PIPE_SIZE, "Invalid pipe size"},
  {CL_INVALID_DEVICE_QUEUE, "Invalid device queue"},
  {CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR, "Invalid GL share group reference KHR"}};

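// Maps an OpenCL error code to its textual description.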
std::string CLErrorCode(cl_int error_code) {
  auto it = error_infos.find(error_code);
  if (it == error_infos.end()) {
    return "Unknown OpenCL error code";
  } else {
    return it->second;
  }
}

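// Maps an axis of an ndim-dimensional tensor onto the corresponding axis of the 4D NHWC layout
// used by the GPU kernels.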
int GetBroadcastGpuAxis(int ndim, int ori_axis) {
  if (ori_axis >= ndim) {
    return ndim - 1;
  }
  int axis = 0;
  if (ndim == DIMENSION_1D) {
    axis = kNHWC_C;
  } else if (ndim == DIMENSION_2D) {
    axis = ori_axis == kNHWC_N ? kNHWC_N : kNHWC_C;
  } else if (ndim == DIMENSION_3D) {
    axis = ori_axis == kNHWC_N ? kNHWC_N : ori_axis == kNHWC_H ? kNHWC_W : kNHWC_C;
  } else if (ndim == DIMENSION_4D) {
    axis = ori_axis;
  } else if (ndim > DIMENSION_4D) {
    MS_LOG(ERROR) << "GPU doesn't support ndim>=" << ndim;
  }
  return axis;
}

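// Packs NHWC host data into the NHWC4 layout (channels padded up to a multiple of 4) expected by
// the OpenCL kernels; the FP16 build additionally converts between float16 and float32 on the fly.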
#ifdef ENABLE_FP16
void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  auto src_fp16 = reinterpret_cast<float16_t *>(src);
  auto src_fp32 = reinterpret_cast<float32_t *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
  auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  for (int n = 0, src_idx = 0; n < tensor.N; n++) {
    for (int h = 0; h < tensor.D * tensor.H; ++h) {
      for (int w = 0; w < tensor.W; ++w) {
        for (int c = 0; c < tensor.C; ++c, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else if (dst_is_fp16) {
            dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
          } else {
            dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar
  if (tensor.ElementsNum == 1) {
    if (dst_is_fp16) {
      dst_fp16[kNHWC_C] = dst_fp16[kNHWC_W] = dst_fp16[kNHWC_H] = dst_fp16[kNHWC_N];
    } else {
      dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
    }
  }
}
#else
void PackNHWCToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(dst);
  MS_ASSERT(src);
  auto src_fp32 = reinterpret_cast<float *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp32 = reinterpret_cast<float *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  for (size_t n = 0, src_idx = 0; n < tensor.N; n++) {
    for (size_t h = 0; h < tensor.D * tensor.H; ++h) {
      for (size_t w = 0; w < tensor.W; ++w) {
        for (size_t c = 0; c < tensor.C; ++c, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else {
            dst_fp32[dst_idx] = src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar
  if (tensor.ElementsNum == 1) {
    dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
  }
}
#endif

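// Packs NCHW host data into the NHWC4 layout; as above, the FP16 build also handles
// float16 <-> float32 conversion on the fly.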
#ifdef ENABLE_FP16
void PackNCHWToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto src_fp32 = reinterpret_cast<float32_t *>(src);
  auto src_fp16 = reinterpret_cast<float16_t *>(src);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  auto dst_fp32 = reinterpret_cast<float32_t *>(dst);
  auto dst_fp16 = reinterpret_cast<float16_t *>(dst);
  for (int src_idx = 0, n = 0; n < tensor.N; n++) {
    for (int c = 0; c < tensor.C; ++c) {
      for (int h = 0; h < tensor.D * tensor.H; ++h) {
        for (int w = 0; w < tensor.W; ++w, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else if (dst_is_fp16) {
            dst_fp16[dst_idx] = src_is_fp16 ? src_fp16[src_idx] : static_cast<float16_t>(src_fp32[src_idx]);
          } else {
            dst_fp32[dst_idx] = src_is_fp16 ? static_cast<float32_t>(src_fp16[src_idx]) : src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar
  if (tensor.ElementsNum == 1) {
    if (dst_is_fp16) {
      // broadcast the single value (written at index kNHWC_N) to the remaining lanes
      dst_fp16[kNHWC_C] = dst_fp16[kNHWC_W] = dst_fp16[kNHWC_H] = dst_fp16[kNHWC_N];
    } else {
      dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
    }
  }
}
#else
void PackNCHWToNHWC4(void *src, void *dst, bool src_is_fp16, bool dst_is_fp16, const GpuTensorInfo &tensor,
                     int data_type) {
  MS_ASSERT(src);
  MS_ASSERT(dst);
  auto src_fp32 = reinterpret_cast<float *>(src);
  auto src_int32 = reinterpret_cast<int32_t *>(src);
  auto dst_fp32 = reinterpret_cast<float *>(dst);
  auto dst_int32 = reinterpret_cast<int32_t *>(dst);
  for (size_t n = 0, src_idx = 0; n < tensor.N; n++) {
    for (size_t c = 0; c < tensor.C; ++c) {
      for (size_t h = 0; h < tensor.D * tensor.H; ++h) {
        for (size_t w = 0; w < tensor.W; ++w, ++src_idx) {
          int dst_idx = ((n * tensor.D * tensor.H + h) * tensor.W + w) * tensor.Slice * C4NUM + c;
          if (data_type == kNumberTypeInt32) {
            dst_int32[dst_idx] = src_int32[src_idx];
          } else {
            dst_fp32[dst_idx] = src_fp32[src_idx];
          }
        }
      }
    }
  }
  // scalar
  if (tensor.ElementsNum == 1) {
    dst_fp32[kNHWC_C] = dst_fp32[kNHWC_W] = dst_fp32[kNHWC_H] = dst_fp32[kNHWC_N];
  }
}
#endif

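// Verifies that a parameter-like tensor is constant and has the expected data type and shape.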
int CheckParamLikeTensor(const std::string &kernel_name, const std::string &tensor_name, lite::Tensor *tensor,
                         TypeId expect_data_type, const std::vector<int> &expect_shape) {
  if (!tensor->IsConst()) {
    MS_LOG(WARNING) << "in " << kernel_name << ": tensor " << tensor_name << " must be Const.";
    return RET_ERROR;
  }
  if (tensor->data_type() != expect_data_type) {
    MS_LOG(WARNING) << "in " << kernel_name << ": tensor's data_type must be " << expect_data_type;
    return RET_ERROR;
  }
  if (tensor->shape() != expect_shape) {
    std::string expect_shape_str = "(";
    for (auto i : expect_shape) {
      expect_shape_str += std::to_string(i) + ",";
    }
    expect_shape_str += ")";

    std::string tensor_shape_str = "(";
    for (auto i : tensor->shape()) {
      tensor_shape_str += std::to_string(i) + ",";
    }
    tensor_shape_str += ")";

    MS_LOG(WARNING) << "in " << kernel_name
                    << ": tensor's shape is wrong. expect_shape: " + expect_shape_str +
                         " tensor->shape(): " + tensor_shape_str;
    return RET_ERROR;
  }
  return RET_OK;
}

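// Returns a heap-allocated copy of the tensor's data, or nullptr if the tensor is empty;
// the returned buffer must be released with FreeStoredData().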
void *StoreTensorData(lite::Tensor *tensor) {
  if ((tensor != nullptr) && (tensor->data() != nullptr) && (tensor->Size() > 0)) {
    void *stored_data = malloc(tensor->Size());
    if (stored_data == nullptr) {
      MS_LOG(ERROR) << "StoreTensorData Malloc Failed.";
      return nullptr;
    }
    memcpy(stored_data, tensor->data(), tensor->Size());
    return stored_data;
  }
  return nullptr;
}

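// Releases memory previously returned by StoreTensorData().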
void FreeStoredData(void *data) {
  if (data != nullptr) {
    free(data);
  }
}

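// Returns the extra OpenCL build options that define DTYPE/DTYPE4 and the image read/write
// functions matching the given data type.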
std::vector<std::string> CreateBuildOptionsExtByDType(TypeId type_id) {
  std::vector<std::string> build_options_ext;
  if (type_id == kNumberTypeInt32) {
    build_options_ext = {" -DDTYPE=int -DDTYPE4=int4 -DWRITE_IMAGE=write_imagei -DREAD_IMAGE=read_imagei "};
  } else if (type_id == kNumberTypeFloat32) {
    build_options_ext = {" -DDTYPE=float -DDTYPE4=float4 -DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef "};
  } else if (type_id == kNumberTypeFloat16) {
    build_options_ext = {" -DDTYPE=half -DDTYPE4=half4 -DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh "};
  }
  return build_options_ext;
}
}  // namespace mindspore::kernel