1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/litert/infer_manager.h"
18 #include "src/litert/kernel/opencl/opencl_kernel.h"
19 #include "src/litert/weight_decoder.h"
20 #include "src/common/file_utils.h"
21
22 using mindspore::lite::RET_ERROR;
23 using mindspore::lite::RET_OK;
24 using mindspore::lite::opencl::ImageSize;
25
26 namespace mindspore::kernel {
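// Maps an axis index in CPU layout to the corresponding axis constant in GPU
// layout (NHWC for 1D-4D tensors, NDHWC for 5D). Returns RET_ERROR when
// gpu_axis is null, ndim is outside [1, 5], or cpu_axis is out of range.
// Illustrative usage (for a 4D NHWC tensor):
//   int gpu_axis = 0;
//   if (CpuAxis2GpuAxis(4, 1, &gpu_axis) == RET_OK) {
//     // gpu_axis == kNHWC_H
//   }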
int CpuAxis2GpuAxis(size_t ndim, int cpu_axis, int *gpu_axis) {
  static const std::vector<std::vector<int>> kCpuAxis2GpuAxisMapTable = {
    // For a 1D tensor, map cpu axis [0] to gpu axis [kNHWC_C].
    {kNHWC_C},
    // For a 2D tensor, map cpu axes [0, 1] to gpu axes [kNHWC_N, kNHWC_C].
    {kNHWC_N, kNHWC_C},
    // For a 3D tensor, map cpu axes [0, 1, 2] to gpu axes [kNHWC_N, kNHWC_W, kNHWC_C].
    {kNHWC_N, kNHWC_W, kNHWC_C},
    // For a 4D tensor, map cpu axes [0, 1, 2, 3] to gpu axes [kNHWC_N, kNHWC_H, kNHWC_W, kNHWC_C].
    {kNHWC_N, kNHWC_H, kNHWC_W, kNHWC_C},
    // For a 5D tensor, map cpu axes [0, 1, 2, 3, 4] to gpu axes [kNDHWC_N, kNDHWC_D, kNDHWC_H, kNDHWC_W, kNDHWC_C].
    {kNDHWC_N, kNDHWC_D, kNDHWC_H, kNDHWC_W, kNDHWC_C},
  };
  if (gpu_axis == nullptr) {
    MS_LOG(WARNING) << "Input parameter gpu_axis is null";
    return RET_ERROR;
  }

  if ((ndim == 0) || (ndim > kCpuAxis2GpuAxisMapTable.size())) {
    MS_LOG(WARNING) << "Only 1D to 5D tensors are supported, bad input ndim: " << ndim;
    return RET_ERROR;
  }

  const auto &axis_map = kCpuAxis2GpuAxisMapTable[ndim - 1];
  if ((cpu_axis < 0) || (static_cast<size_t>(cpu_axis) >= axis_map.size())) {
    MS_LOG(WARNING) << "Input cpu_axis: " << cpu_axis << " is out of range [0, " << axis_map.size() << ")";
    return RET_ERROR;
  }
  *gpu_axis = axis_map[cpu_axis];
  return RET_OK;
}

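// Rounds each dimension of the global work size up to a multiple of the
// corresponding local work size (as OpenCL requires for enqueueing) and
// stores the results as cl::NDRange in global_range_/local_range_. For
// example, global {100, 3} with local {16, 1} is aligned to {112, 3}.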
void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
  std::vector<size_t> internal_global_ws = global;
  for (size_t i = 0; i < local.size(); ++i) {
    internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
  }

  MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
  for (size_t i = 0; i < global.size(); i++) {
    MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
  }
  for (size_t i = 0; i < local.size(); i++) {
    MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
  }
  if (local.empty()) {
    local_range_ = cl::NullRange;
  }
  if (global.size() == 1) {
    global_range_ = cl::NDRange(internal_global_ws.at(0));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0));
    }
  } else if (global.size() == 2) {
    global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0), local.at(1));
    }
  } else if (global.size() >= 3) {
    global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
    }
  }
}

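// Computes the OpenCL image2D size (width, height, channel data type) of the
// idx-th output tensor. Only float32/float16/int32/int8 tensors are supported.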
int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
  MS_ASSERT(img_size);
  if (idx >= out_tensors_.size()) {
    return RET_ERROR;
  }
  auto img_info = GpuTensorInfo::CreateGpuTensorInfo(out_tensors_[idx]);
  if (img_info == nullptr) {
    MS_LOG(ERROR) << "img_info is nullptr.";
    return RET_ERROR;
  }
  size_t img_dtype = CL_FLOAT;
  switch (out_tensors_[idx]->data_type()) {
    case kNumberTypeFloat32: {
      img_dtype = CL_FLOAT;
      break;
    }
    case kNumberTypeInt32: {
      img_dtype = CL_SIGNED_INT32;
      break;
    }
    case kNumberTypeFloat16: {
      img_dtype = CL_HALF_FLOAT;
      break;
    }
    case kNumberTypeInt8: {
      img_dtype = CL_SIGNED_INT8;
      break;
    }
    default: {
      MS_LOG(WARNING) << "Unsupported data_type " << out_tensors_[idx]->data_type();
      return RET_ERROR;
    }
  }
  *img_size = {img_info->width, img_info->height, img_dtype};
  return RET_OK;
}

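// Prints the shape of output_tensor to stdout, e.g. "shape=(   1,  24,  24,  16) ".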
void OpenCLKernel::PrintShape(lite::Tensor *output_tensor) {
  printf("shape=(");
  auto shape = output_tensor->shape();
  for (size_t i = 0; i < shape.size(); ++i) {
    printf("%4d", shape[i]);
    if (i + 1 < shape.size()) {
      printf(",");
    }
  }
  printf(") ");
}

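// Debug helper: synchronizes the command queue, reads the first output tensor
// back from device memory (mapping the buffer or reading the image, depending
// on the memory type), prints the first print_num elements to stdout, and
// optionally dumps the raw data to out_file as a binary file.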
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
  printf("%-30s ", name().c_str());
  if (out_tensors().empty()) {
    return;
  }
  auto *tensor = out_tensors()[0];
  auto mem_type = GetMemType();
  if (tensor == nullptr || tensor->data() == nullptr) {
    return;
  }

  auto img_info = GpuTensorInfo::CreateGpuTensorInfo(tensor);
  if (img_info == nullptr) {
    MS_LOG(ERROR) << "img_info is nullptr.";
    return;
  }
  auto size = mem_type == lite::opencl::MemType::BUF ? img_info->OriginSize : img_info->Image2DSize;
  std::vector<char> data(size);
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  auto allocator = runtime->GetAllocator();
  if (!runtime->SyncCommandQueue()) {
    MS_LOG(ERROR) << "SyncCommandQueue failed.";
  }
  if (mem_type == lite::opencl::MemType::BUF) {
    if (allocator->MapBuffer(tensor->data(), CL_MAP_READ, nullptr, true) == nullptr) {
      MS_LOG(ERROR) << "Map Buffer failed.";
    }
    memcpy(data.data(), tensor->data(), img_info->OriginSize);
    if (allocator->UnmapBuffer(tensor->data()) != RET_OK) {
      MS_LOG(ERROR) << "UnmapBuffer failed.";
    }
  } else {
    runtime->ReadImage(tensor->data(), data.data());
  }

  PrintShape(tensor);

  auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info->ElementsNum : img_info->ElementsC4Num;
  for (int i = 0; i < print_num && i < static_cast<int>(total_num); ++i) {
#ifdef ENABLE_FP16
    if (tensor->data_type() == kNumberTypeInt32) {
      printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat16) {
      printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat32) {
      printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeInt8) {
      printf("%d %7d | ", i, static_cast<int>(reinterpret_cast<int8_t *>(data.data())[i]));
    }
#else
    printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
#endif
  }
  printf("\n");

  if (!out_file.empty()) {
    (void)lite::WriteToBin(out_file, data.data(), data.size());
  }
}

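// Returns the common preamble prepended to OpenCL kernel sources: it enables
// the cl_khr_fp16 extension, declares two samplers, and defines the
// DTYPE/DTYPE4/READ_IMAGE/WRITE_IMAGE macros to match the kernel's registered
// data type. For kNumberTypeFloat16, for instance, DTYPE expands to half and
// READ_IMAGE to read_imageh. Returns an empty string for unsupported types.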
std::string OpenCLKernel::OpenCLKernelHeader() {
  std::stringstream header;
  header << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
            "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
            "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n";
  if (this->registry_data_type_ == kNumberTypeFloat32) {
    header << "#define DTYPE float\n"
              "#define DTYPE4 float4\n"
              "#define WRITE_IMAGE write_imagef\n"
              "#define READ_IMAGE read_imagef\n";
  } else if (this->registry_data_type_ == kNumberTypeFloat16) {
    header << "#define DTYPE half\n"
              "#define DTYPE4 half4\n"
              "#define WRITE_IMAGE write_imageh\n"
              "#define READ_IMAGE read_imageh\n";
  } else if (this->registry_data_type_ == kNumberTypeInt32) {
    header << "#define DTYPE int\n"
              "#define DTYPE4 int4\n"
              "#define WRITE_IMAGE write_imagei\n"
              "#define READ_IMAGE read_imagei\n";
  } else {
    MS_LOG(ERROR) << "Unsupported data type: " << this->registry_data_type_;
    return "";
  }
  return header.str();
}

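// Returns true only if every output tensor already holds OpenCL image memory
// obtained from the OpenCL allocator, i.e. no further allocation is needed.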
bool OpenCLKernel::MallocDataDone() {
  for (auto &out_tensor : out_tensors_) {
    if (out_tensor->data() == nullptr) {
      return false;
    }
    auto allocator = out_tensor->allocator();
    if (allocator == nullptr) {
      return false;
    }
    lite::opencl::MemType mem_type;
    auto buffer = reinterpret_cast<mindspore::lite::opencl::OpenCLAllocator *>(allocator.get())
                    ->GetOpenclMemPtr(out_tensor->data(), &mem_type);
    if ((buffer == nullptr) || (mem_type != lite::opencl::MemType::IMG)) {
      return false;
    }
  }
  return true;
}

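// Allocates device memory for all output tensors before the kernel runs:
// image memory sized via GetImageSize() when the kernel outputs images,
// plain buffer memory otherwise. Skips all work if MallocDataDone() reports
// that the outputs are already backed by image memory.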
int OpenCLKernel::PreProcess() {
  if (MallocDataDone()) {
    return RET_OK;
  }
  int ret = ReSize();
  if (ret != RET_OK) {
    return ret;
  }
  for (size_t i = 0; i < out_tensors_.size(); ++i) {
    auto *output = out_tensors_.at(i);
    CHECK_NULL_RETURN(output);
    CHECK_NULL_RETURN(output->allocator());
    if (GetMemType() == lite::opencl::MemType::IMG) {
      ImageSize img_size;
      ret = GetImageSize(i, &img_size);
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "GetImageSize failed";
        return ret;
      }
      auto data_ptr =
        output->allocator()->Malloc(img_size.width, img_size.height, static_cast<enum DataType>(output->data_type()));
      if (data_ptr == nullptr) {
        MS_LOG(ERROR) << "Malloc data failed";
        return RET_ERROR;
      }
      output->set_data(data_ptr);
    } else {
      ret = output->MallocData();
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "MallocData failed";
        return ret;
      }
    }
    output->ResetRefCount();
  }
  is_oversize_kernel_ = ocl_runtime_->GetAllocator()->IsOverSize();
  return RET_OK;
}

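// Runs the framework's shape inference for this kernel's tensors; a no-op if
// the shapes are already known.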
int OpenCLKernel::InferShape() {
  if (InferShapeDone()) {
    return RET_OK;
  }
  auto ret = lite::KernelInferShape(in_tensors_, out_tensors_, op_parameter_);
  if (ret != RET_OK) {
    MS_LOG(WARNING) << "InferShape failed, type: "
                    << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(type()));
    return ret;
  }
  return RET_OK;
}

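// Re-runs shape inference, spec checking, and preparation after input shapes
// change; a no-op if shape inference is already done.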
int OpenCLKernel::ReSize() {
  if (InferShapeDone()) {
    return RET_OK;
  }
  auto ret = InferShape();
  if (ret != RET_OK) {
    return ret;
  }
  ret = CheckSpecs();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ReSize failed: CheckSpecs failed!";
    return ret;
  }
  ret = Prepare();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ReSize failed: Prepare failed!";
    return ret;
  }
  return RET_OK;
}

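// Builds the list of candidate local work-group sizes to profile: the current
// local_size_ plus every combination of per-dimension candidates from
// GenerateLocalByGlobal(), filtered by the device's per-dimension work-item
// limits and a total work-group size in [8, max work-group size].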
std::vector<BaseTuningParameter> OpenCLKernel::GenerateTuningParam() {
  size_t ndim = global_size_.size();
  std::vector<BaseTuningParameter> tuning_params = {};
  if (ndim == 0) {
    MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is empty.";
    return tuning_params;
  }
  BaseTuningParameter default_tuning_param = BaseTuningParameter();
  default_tuning_param.local_size = local_size_;
  tuning_params.push_back(default_tuning_param);
  std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
  size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
  const size_t MIN_WORKGROUP_SIZE = 8;
  std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
  std::set<size_t> candidate_y = {1};
  std::set<size_t> candidate_z = {1};
  if (ndim > 1) {
    candidate_y = GenerateLocalByGlobal(global_size_[1]);
  }
  if (ndim > 2) {
    candidate_z = GenerateLocalByGlobal(global_size_[2]);
  }
  for (auto x : candidate_x) {
    if (x <= max_work_items[0]) {
      for (auto y : candidate_y) {
        if (y <= max_work_items[1]) {
          for (auto z : candidate_z) {
            auto group_size = x * y * z;
            if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
              BaseTuningParameter tuning_param = BaseTuningParameter();
              tuning_param.local_size = {x, y, z};
              tuning_params.push_back(tuning_param);
            }
          }
        }
      }
    }
  }
  return tuning_params;
}

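// Applies a tuning parameter by truncating its local size to the rank of
// global_size_ and re-aligning the global/local ranges.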
int OpenCLKernel::AssignTuningParam(const BaseTuningParameter &param) {
  std::vector<size_t> local_size_tmp = param.local_size;
  if (local_size_tmp.size() > global_size_.size()) {
    local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size());
  }
  AlignGlobalLocal(global_size_, local_size_tmp);
  return RET_OK;
}

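// Profiles the kernel with every candidate tuning parameter and keeps the
// fastest one. Requires an OpenCL runtime with profiling enabled; in FAST
// tuning mode only the convolution-type ops in FAST_MODE_OPS are tuned.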
int OpenCLKernel::Tune() {
  if (!ocl_runtime_->isProfiling()) {
    MS_LOG(WARNING) << "Tuning mode requires OpenCL runtime profiling.";
    return RET_OK;
  }
  lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode();
  if (mode == lite::opencl::TuningMode::DEFAULT) {
    return RET_OK;
  }
  static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2DFusion,
                                              schema::PrimitiveType_Conv2dTransposeFusion};
  if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
    return RET_OK;
  }
  auto tuning_params = GenerateTuningParam();
  if (tuning_params.empty()) {
    MS_LOG(WARNING) << "Tuning param size is 0.";
    return RET_OK;
  }
  int index = -1;
  double min_time = MAX_PROFILING_TIME_MILLI_SECOND;
  for (size_t i = 0; i < tuning_params.size(); i++) {
    AssignTuningParam(tuning_params[i]);
    auto ret = Run();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i];
      return ret;
    }
    double current_time = GetProfilingTimeMs();
    MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exec time " << current_time << "ms";
    if (current_time < min_time) {
      min_time = current_time;
      index = static_cast<int>(i);
    }
  }
  if (index != -1) {
    MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exec time " << min_time
                 << "ms";
    AssignTuningParam(tuning_params[index]);
  } else {
    MS_LOG(WARNING) << "Cannot find a suitable tuning param.";
  }
  return RET_OK;
}

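// Returns the kernel execution time in milliseconds from the OpenCL event's
// profiling timestamps, which are reported in nanoseconds. Illustrative
// arithmetic: time_end - time_start == 2500000 ns yields 2.5 ms.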
double OpenCLKernel::GetProfilingTimeMs() {
  if (!ocl_runtime_->isProfiling()) {
    return MAX_PROFILING_TIME_MILLI_SECOND;
  }
  cl_ulong time_start;
  cl_ulong time_end;
  event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
  event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
  cl_ulong time_ns = time_end - time_start;
  return static_cast<double>(time_ns) * 1e-6;
}

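// Generates candidate local sizes for one dimension of the global work size:
// all powers of two not exceeding global_i, plus every divisor of global_i up
// to 16. For example, global_i == 12 yields {1, 2, 3, 4, 6, 8, 12}.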
std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
  std::set<size_t> local_sizes = {};
  size_t size = 1;
  while (size <= global_i) {
    local_sizes.insert(size);
    size *= 2;
  }
  for (size_t i = 1; i <= 16; i++) {
    if (global_i % i == 0) {
      local_sizes.insert(i);
    }
  }
  return local_sizes;
}

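// Validates that the first output fits in a device image (when image memory
// is used) and that the first input, if any, has a supported data type
// (float32/float16/int32/int8).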
int OpenCLKernel::CheckSpecs() {
  if (out_mem_type_ == lite::opencl::MemType::IMG) {
    auto img_info = GpuTensorInfo::CreateGpuTensorInfo(out_tensors_[0]);
    if (img_info == nullptr || !img_info->IsImageSizeValid()) {
      return RET_ERROR;
    }
  }
  if (!in_tensors_.empty()) {
    if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 &&
        in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) {
      MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type();
      return RET_ERROR;
    }
  }
  return RET_OK;
}
}  // namespace mindspore::kernel