• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/litert/infer_manager.h"
18 #include "src/litert/kernel/opencl/opencl_kernel.h"
19 #include "src/litert/weight_decoder.h"
20 #include "src/common/file_utils.h"
21 
22 using mindspore::lite::RET_ERROR;
23 using mindspore::lite::RET_OK;
24 using mindspore::lite::opencl::ImageSize;
25 
26 namespace mindspore::kernel {
CpuAxis2GpuAxis(size_t ndim,int cpu_axis,int * gpu_axis)27 int CpuAxis2GpuAxis(size_t ndim, int cpu_axis, int *gpu_axis) {
28   static const std::vector<std::vector<int>> kCpuAxis2GpuAxisMapTable = {
29     // For 1D tensor, map the cpu axis [0] to gpu axis [kNHWC_C].
30     {kNHWC_C},
31     // For 2D tensor, map the cpu axis [0, 1] to gpu axis [kNHWC_N, kNHWC_C].
32     {kNHWC_N, kNHWC_C},
33     // For 3D tensor, map the cpu axis [0, 1, 2] to gpu axis [kNHWC_N, kNHWC_W, kNHWC_C].
34     {kNHWC_N, kNHWC_W, kNHWC_C},
35     // For 4D tensor, map the cpu axis [0, 1, 2, 3] to gpu axis [kNHWC_N, kNHWC_H, kNHWC_W, kNHWC_C].
36     {kNHWC_N, kNHWC_H, kNHWC_W, kNHWC_C},
37     // For 5D tensor, map the cpu axis [0, 1, 2, 3, 4] to gpu axis [kNDHWC_N, kNDHWC_D, kNDHWC_H, kNDHWC_W, kNDHWC_C].
38     {kNDHWC_N, kNDHWC_D, kNDHWC_H, kNDHWC_W, kNDHWC_C},
39   };
40   if (gpu_axis == nullptr) {
41     MS_LOG(WARNING) << "Input parameter gpu axis is null";
42     return RET_ERROR;
43   }
44 
45   if ((ndim == 0) || (ndim > kCpuAxis2GpuAxisMapTable.size())) {
46     MS_LOG(WARNING) << "Only support ndim of 1D...5D, bad input ndim: " << ndim;
47     return RET_ERROR;
48   }
49 
50   const auto &axis_map = kCpuAxis2GpuAxisMapTable[ndim - 1];
51   if ((cpu_axis < 0) || (static_cast<size_t>(cpu_axis) >= axis_map.size())) {
52     MS_LOG(WARNING) << "Input cpu axis: " << cpu_axis << " is out of range [0," << axis_map.size() << "]";
53     return RET_ERROR;
54   }
55   *gpu_axis = axis_map[cpu_axis];
56   return RET_OK;
57 }
58 
AlignGlobalLocal(const std::vector<size_t> & global,const std::vector<size_t> & local)59 void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
60   std::vector<size_t> internal_global_ws = global;
61   for (size_t i = 0; i < local.size(); ++i) {
62     internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
63   }
64 
65   MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
66   for (size_t i = 0; i < global.size(); i++) {
67     MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
68   }
69   for (size_t i = 0; i < local.size(); i++) {
70     MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
71   }
72   if (local.empty()) {
73     local_range_ = cl::NullRange;
74   }
75   if (global.size() == 1) {
76     global_range_ = cl::NDRange(internal_global_ws.at(0));
77     if (!local.empty()) {
78       local_range_ = cl::NDRange(local.at(0));
79     }
80   } else if (global.size() == 2) {
81     global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
82     if (!local.empty()) {
83       local_range_ = cl::NDRange(local.at(0), local.at(1));
84     }
85   } else if (global.size() >= 3) {
86     global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
87     if (!local.empty()) {
88       local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
89     }
90   }
91 }
92 
GetImageSize(size_t idx,lite::opencl::ImageSize * img_size)93 int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
94   MS_ASSERT(img_size);
95   if (idx >= out_tensors_.size()) {
96     return RET_ERROR;
97   }
98   auto img_info = GpuTensorInfo::CreateGpuTensorInfo(out_tensors_[idx]);
99   size_t img_dtype = CL_FLOAT;
100   switch (out_tensors_[idx]->data_type()) {
101     case kNumberTypeFloat32: {
102       img_dtype = CL_FLOAT;
103       break;
104     }
105     case kNumberTypeInt32: {
106       img_dtype = CL_SIGNED_INT32;
107       break;
108     }
109     case kNumberTypeFloat16: {
110       img_dtype = CL_HALF_FLOAT;
111       break;
112     }
113     case kNumberTypeInt8: {
114       img_dtype = CL_SIGNED_INT8;
115       break;
116     }
117     default: {
118       MS_LOG(WARNING) << "Unsupported data_type " << out_tensors_[idx]->data_type();
119       return RET_ERROR;
120     }
121   }
122   if(img_info == nullptr){
123     MS_LOG(ERROR) << "img_info is nullptr.";
124     return RET_ERROR;
125   }
126   *img_size = {img_info->width, img_info->height, img_dtype};
127   return RET_OK;
128 }
129 
PrintShape(lite::Tensor * output_tensor)130 void OpenCLKernel::PrintShape(lite::Tensor *output_tensor) {
131   printf("shape=(");
132   auto shape = output_tensor->shape();
133   for (size_t i = 0; i < shape.size(); ++i) {
134     printf("%4d", shape[i]);
135     if (i + 1 < shape.size()) {
136       printf(",");
137     }
138   }
139   printf(") ");
140 }
141 
// Debug helper: prints the kernel name, the first output tensor's shape, and
// its first `print_num` elements to stdout. If `out_file` is non-empty, the
// host-side copy of the tensor memory is also dumped as a binary file.
// Best-effort: mapping/sync errors are logged but do not abort the dump.
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
  printf("%-30s ", name().c_str());
  if (out_tensors().empty()) {
    return;
  }
  auto *tensor = out_tensors()[0];
  auto mem_type = GetMemType();
  if (tensor == nullptr || tensor->data() == nullptr) {
    return;
  }

  auto img_info = GpuTensorInfo::CreateGpuTensorInfo(tensor);
  if(img_info == nullptr){
    MS_LOG(ERROR) << "img_info is nullptr.";
    return;
  }
  // Buffer memory is copied at its original byte size; image memory at its
  // 2D image byte size (which may include row padding).
  auto size = mem_type == lite::opencl::MemType::BUF ? img_info->OriginSize : img_info->Image2DSize;
  std::vector<char> data(size);
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  auto allocator = runtime->GetAllocator();
  // Ensure the device has finished producing the tensor before reading it back.
  if (!runtime->SyncCommandQueue()) {
    MS_LOG(ERROR) << "SyncCommandQueue failed.";
  }
  if (mem_type == lite::opencl::MemType::BUF) {
    // Map the buffer for host reads, copy it out, then unmap.
    if (allocator->MapBuffer(tensor->data(), CL_MAP_READ, nullptr, true) == nullptr) {
      MS_LOG(ERROR) << "Map Buffer failed.";
    }
    memcpy(data.data(), tensor->data(), img_info->OriginSize);
    if (allocator->UnmapBuffer(tensor->data()) != RET_OK) {
      MS_LOG(ERROR) << "UnmapBuffer failed.";
    }
  } else {
    // NOTE(review): ReadImage's result is ignored here, unlike the checked
    // calls above — confirm it cannot fail silently.
    runtime->ReadImage(tensor->data(), data.data());
  }

  PrintShape(tensor);

  // Buffers hold ElementsNum values; image memory is channel-4 aligned, so
  // it holds ElementsC4Num values.
  auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info->ElementsNum : img_info->ElementsC4Num;
  for (int i = 0; i < print_num && i < static_cast<int>(total_num); ++i) {
#ifdef ENABLE_FP16
    if (tensor->data_type() == kNumberTypeInt32) {
      printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat16) {
      printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat32) {
      printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeInt8) {
      printf("%d %7d | ", i, static_cast<int>(reinterpret_cast<int8_t *>(data.data())[i]));
    }
#else
    // Without FP16 support, elements are assumed to be float32 for printing.
    printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
#endif
  }
  printf("\n");

  if (!out_file.empty()) {
    (void)lite::WriteToBin(out_file, data.data(), data.size());
  }
}
202 
OpenCLKernelHeader()203 std::string OpenCLKernel::OpenCLKernelHeader() {
204   std::stringstream header;
205   header << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
206             "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
207             "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n";
208   if (this->registry_data_type_ == kNumberTypeFloat32) {
209     header << "#define DTYPE float\n"
210               "#define DTYPE4 float4\n"
211               "#define WRITE_IMAGE write_imagef\n"
212               "#define READ_IMAGE read_imagef\n";
213   } else if (this->registry_data_type_ == kNumberTypeFloat16) {
214     header << "#define DTYPE half\n"
215               "#define DTYPE4 half4\n"
216               "#define WRITE_IMAGE write_imageh\n"
217               "#define READ_IMAGE read_imageh\n";
218   } else if (this->registry_data_type_ == kNumberTypeInt32) {
219     header << "#define DTYPE int\n"
220               "#define DTYPE4 int4\n"
221               "#define WRITE_IMAGE write_imagei\n"
222               "#define READ_IMAGE read_imagei\n";
223   } else {
224     MS_LOG(ERROR) << "Unsupported data type: " << this->registry_data_type_;
225     return "";
226   }
227   return header.str();
228 }
229 
MallocDataDone()230 bool OpenCLKernel::MallocDataDone() {
231   for (auto &out_tensor : out_tensors_) {
232     if (out_tensor->data() == nullptr) {
233       return false;
234     }
235     auto allocator = out_tensor->allocator();
236     if (allocator == nullptr) {
237       return false;
238     }
239     lite::opencl::MemType memType;
240     auto buffer = reinterpret_cast<mindspore::lite::opencl::OpenCLAllocator *>(allocator.get())
241                     ->GetOpenclMemPtr(out_tensor->data(), &memType);
242     if ((buffer == nullptr) || (memType != lite::opencl::MemType::IMG)) {
243       return false;
244     }
245   }
246   return true;
247 }
248 
// Prepares the kernel's outputs for execution: re-runs shape inference and
// kernel preparation via ReSize(), then allocates device memory (image or
// buffer, depending on GetMemType()) for every output tensor.
// Returns RET_OK on success, otherwise the failing step's error code.
int OpenCLKernel::PreProcess() {
  // All outputs already hold valid OpenCL image memory — nothing to allocate.
  if (MallocDataDone()) {
    return RET_OK;
  }
  int ret = ReSize();
  if (ret != RET_OK) {
    return ret;
  }
  for (size_t i = 0; i < out_tensors_.size(); ++i) {
    auto *output = out_tensors_.at(i);
    CHECK_NULL_RETURN(output);
    CHECK_NULL_RETURN(output->allocator());
    if (GetMemType() == lite::opencl::MemType::IMG) {
      // Image memory: compute the required 2D image size, then allocate it
      // directly through the allocator and attach it to the tensor.
      ImageSize img_size;
      ret = GetImageSize(i, &img_size);
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "GetImageSize failed";
        return ret;
      }
      auto data_ptr =
        output->allocator()->Malloc(img_size.width, img_size.height, static_cast<enum DataType>(output->data_type()));
      if (data_ptr == nullptr) {
        MS_LOG(ERROR) << "Malloc data failed";
        return RET_ERROR;
      }
      output->set_data(data_ptr);
    } else {
      // Buffer memory: let the tensor allocate through its own allocator.
      ret = output->MallocData();
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "MallocData failed";
        return ret;
      }
    }
    output->ResetRefCount();
  }
  // Record whether the allocator has gone over its memory budget.
  is_oversize_kernel_ = ocl_runtime_->GetAllocator()->IsOverSize();
  return RET_OK;
}
287 
InferShape()288 int OpenCLKernel::InferShape() {
289   if (InferShapeDone()) {
290     return RET_OK;
291   }
292   auto ret = lite::KernelInferShape(in_tensors_, out_tensors_, op_parameter_);
293   if (ret != RET_OK) {
294     MS_LOG(WARNING) << "InferShape failed, type: "
295                     << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(type()));
296     return ret;
297   }
298   return RET_OK;
299 }
300 
ReSize()301 int OpenCLKernel::ReSize() {
302   if (InferShapeDone()) {
303     return RET_OK;
304   }
305   auto ret = InferShape();
306   if (ret != RET_OK) {
307     return ret;
308   }
309   ret = CheckSpecs();
310   if (ret != RET_OK) {
311     MS_LOG(ERROR) << "ReSize failed for check kernel specs!";
312     return ret;
313   }
314   ret = Prepare();
315   if (ret != RET_OK) {
316     MS_LOG(ERROR) << "ReSize failed for kernel prepare!";
317     return ret;
318   }
319   return RET_OK;
320 }
321 
GenerateTuningParam()322 std::vector<BaseTuningParameter> OpenCLKernel::GenerateTuningParam() {
323   size_t ndim = global_size_.size();
324   std::vector<BaseTuningParameter> tuning_params = {};
325   if (ndim == 0) {
326     MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null.";
327     return tuning_params;
328   }
329   BaseTuningParameter default_tuning_param = BaseTuningParameter();
330   default_tuning_param.local_size = local_size_;
331   tuning_params.push_back(default_tuning_param);
332   std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
333   size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
334   const size_t MIN_WORKGROUP_SIZE = 8;
335   std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
336   std::set<size_t> candidate_y = {1};
337   std::set<size_t> candidate_z = {1};
338   if (ndim > 1) {
339     candidate_y = GenerateLocalByGlobal(global_size_[1]);
340   }
341   if (ndim > 2) {
342     candidate_z = GenerateLocalByGlobal(global_size_[2]);
343   }
344   for (auto x : candidate_x) {
345     if (x <= max_work_items[0]) {
346       for (auto y : candidate_y) {
347         if (y <= max_work_items[1]) {
348           for (auto z : candidate_z) {
349             auto group_size = x * y * z;
350             if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
351               BaseTuningParameter tuning_param = BaseTuningParameter();
352               tuning_param.local_size = {x, y, z};
353               tuning_params.push_back(tuning_param);
354             }
355           }
356         }
357       }
358     }
359   }
360   return tuning_params;
361 }
362 
AssignTuningParam(const BaseTuningParameter & param)363 int OpenCLKernel::AssignTuningParam(const BaseTuningParameter &param) {
364   std::vector<size_t> local_size_tmp = param.local_size;
365   if (local_size_tmp.size() > global_size_.size()) {
366     local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size());
367   }
368   AlignGlobalLocal(global_size_, local_size_tmp);
369   return RET_OK;
370 }
371 
// Searches the generated work-group-size candidates for the one with the
// lowest measured execution time and applies it to this kernel.
// Requires OpenCL runtime profiling; otherwise (or in DEFAULT tuning mode)
// this is a no-op returning RET_OK.
int OpenCLKernel::Tune() {
  if (!ocl_runtime_->isProfiling()) {
    MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling.";
    return RET_OK;
  }
  lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode();
  if (mode == lite::opencl::TuningMode::DEFAULT) {
    return RET_OK;
  }
  // In FAST mode, only convolution-type ops are tuned; others are skipped.
  static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2DFusion,
                                              schema::PrimitiveType_Conv2dTransposeFusion};
  if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
    return RET_OK;
  }
  auto tuning_params = GenerateTuningParam();
  if (tuning_params.empty()) {
    MS_LOG(WARNING) << "Tuning param size is 0.";
    return RET_OK;
  }
  // index == -1 means no candidate beat the MAX_PROFILING_TIME cap.
  int index = -1;
  double min_time = MAX_PROFILING_TIME_MILLI_SECOND;
  for (size_t i = 0; i < tuning_params.size(); i++) {
    // Apply the candidate, run once, and time it via OpenCL profiling events.
    AssignTuningParam(tuning_params[i]);
    auto ret = Run();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i];
      return ret;
    }
    double current_time = GetProfilingTimeMs();
    MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms";
    if (current_time < min_time) {
      min_time = current_time;
      index = i;
    }
  }
  if (index != -1) {
    MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
                 << "ms";
    // Re-apply the winner: the loop above leaves the last candidate active.
    AssignTuningParam(tuning_params[index]);
  } else {
    MS_LOG(WARNING) << "Cannot find suitable param.";
  }
  return RET_OK;
}
416 
GetProfilingTimeMs()417 double OpenCLKernel::GetProfilingTimeMs() {
418   if (!ocl_runtime_->isProfiling()) {
419     return MAX_PROFILING_TIME_MILLI_SECOND;
420   }
421   cl_ulong time_start;
422   cl_ulong time_end;
423   event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
424   event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
425   cl_ulong time_ns = time_end - time_start;
426   return static_cast<double>(time_ns) * 1e-6;
427 }
428 
GenerateLocalByGlobal(size_t global_i)429 std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
430   std::set<size_t> local_ = {};
431   int index = 1;
432   while (index <= static_cast<int>(global_i)) {
433     local_.insert(index);
434     index *= 2;
435   }
436   for (size_t i = 1; i <= 16; i++) {
437     if (global_i % i == 0) {
438       local_.insert(i);
439     }
440   }
441   return local_;
442 }
443 
CheckSpecs()444 int OpenCLKernel::CheckSpecs() {
445   if (out_mem_type_ == lite::opencl::MemType::IMG) {
446     if (!GpuTensorInfo::CreateGpuTensorInfo(out_tensors_[0])->IsImageSizeValid()) {
447       return RET_ERROR;
448     }
449   }
450   if (in_tensors_.size() > 0) {
451     if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 &&
452         in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) {
453       MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type();
454       return RET_ERROR;
455     }
456   }
457   return RET_OK;
458 }
459 }  // namespace mindspore::kernel
460