• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "src/runtime/infer_manager.h"
18 #include "src/runtime/kernel/opencl/opencl_kernel.h"
19 #include "src/weight_decoder.h"
20 #include "src/common/file_utils.h"
21 
22 using mindspore::lite::RET_ERROR;
23 using mindspore::lite::RET_OK;
24 using mindspore::lite::opencl::ImageSize;
25 
26 namespace mindspore::kernel {
AlignGlobalLocal(const std::vector<size_t> & global,const std::vector<size_t> & local)27 void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
28   std::vector<size_t> internal_global_ws = global;
29   for (size_t i = 0; i < local.size(); ++i) {
30     internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
31   }
32 
33   MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
34   for (size_t i = 0; i < global.size(); i++) {
35     MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
36   }
37   for (size_t i = 0; i < local.size(); i++) {
38     MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
39   }
40   if (local.empty()) {
41     local_range_ = cl::NullRange;
42   }
43   if (global.size() == 1) {
44     global_range_ = cl::NDRange(internal_global_ws.at(0));
45     if (!local.empty()) {
46       local_range_ = cl::NDRange(local.at(0));
47     }
48   } else if (global.size() == 2) {
49     global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
50     if (!local.empty()) {
51       local_range_ = cl::NDRange(local.at(0), local.at(1));
52     }
53   } else if (global.size() >= 3) {
54     global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
55     if (!local.empty()) {
56       local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
57     }
58   }
59 }
60 
GetImageSize(size_t idx,lite::opencl::ImageSize * img_size)61 int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
62   MS_ASSERT(img_size);
63   if (idx >= out_tensors_.size()) {
64     return RET_ERROR;
65   }
66   auto img_info = GpuTensorInfo(out_tensors_[idx]);
67   size_t img_dtype = CL_FLOAT;
68   switch (out_tensors_[idx]->data_type()) {
69     case kNumberTypeFloat32: {
70       img_dtype = CL_FLOAT;
71       break;
72     }
73     case kNumberTypeInt32: {
74       img_dtype = CL_SIGNED_INT32;
75       break;
76     }
77     case kNumberTypeFloat16: {
78       img_dtype = CL_HALF_FLOAT;
79       break;
80     }
81     case kNumberTypeInt8: {
82       img_dtype = CL_SIGNED_INT8;
83       break;
84     }
85     default: {
86       MS_LOG(WARNING) << "Unsupported data_type " << out_tensors_[idx]->data_type();
87       return RET_ERROR;
88     }
89   }
90   *img_size = {img_info.width, img_info.height, img_dtype};
91   return RET_OK;
92 }
93 
PrintOutput(int print_num,const std::string & out_file)94 void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
95   printf("%-30s ", name().c_str());
96   if (out_tensors().empty()) {
97     return;
98   }
99   auto *tensor = out_tensors()[0];
100   auto mem_type = GetMemType();
101   if (tensor == nullptr || tensor->data() == nullptr) {
102     return;
103   }
104 
105   GpuTensorInfo img_info(tensor);
106   auto size = mem_type == lite::opencl::MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
107   std::vector<char> data(size);
108   auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
109   auto runtime = runtime_wrapper.GetInstance();
110   auto allocator = runtime->GetAllocator();
111   if (!runtime->SyncCommandQueue()) {
112     MS_LOG(ERROR) << "SyncCommandQueue failed.";
113   }
114   if (mem_type == lite::opencl::MemType::BUF) {
115     if (allocator->MapBuffer(tensor->data(), CL_MAP_READ, nullptr, true) == nullptr) {
116       MS_LOG(ERROR) << "Map Buffer failed.";
117     }
118     memcpy(data.data(), tensor->data(), img_info.OriginSize);
119     if (allocator->UnmapBuffer(tensor->data()) != RET_OK) {
120       MS_LOG(ERROR) << "UnmapBuffer failed.";
121     }
122   } else {
123     runtime->ReadImage(tensor->data(), data.data());
124   }
125 
126   printf("shape=(");
127   auto shape = tensor->shape();
128   for (int i = 0; i < shape.size(); ++i) {
129     printf("%4d", shape[i]);
130     if (i + 1 < shape.size()) {
131       printf(",");
132     }
133   }
134   printf(") ");
135 
136   auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
137   for (int i = 0; i < print_num && i < total_num; ++i) {
138     if (tensor->data_type() == kNumberTypeInt32) {
139       printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
140     } else if (tensor->data_type() == kNumberTypeFloat16) {
141       printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
142     } else if (tensor->data_type() == kNumberTypeFloat32) {
143       printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
144     } else if (tensor->data_type() == kNumberTypeInt8) {
145       printf("%d %7d | ", i, static_cast<int>(reinterpret_cast<int8_t *>(data.data())[i]));
146     }
147   }
148   printf("\n");
149 
150   if (!out_file.empty()) {
151     (void)lite::WriteToBin(out_file, data.data(), data.size());
152   }
153 }
154 
PreProcess()155 int OpenCLKernel::PreProcess() {
156   int ret = RET_OK;
157   ret = ReSize();
158   if (ret != RET_OK) {
159     return ret;
160   }
161   for (auto i = 0; i < out_tensors_.size(); ++i) {
162     auto *output = out_tensors_.at(i);
163     CHECK_NULL_RETURN(output);
164     CHECK_NULL_RETURN(output->allocator());
165     if (GetMemType() == lite::opencl::MemType::IMG) {
166       ImageSize img_size;
167       ret = GetImageSize(i, &img_size);
168       if (ret != RET_OK) {
169         MS_LOG(ERROR) << "GetImageSize failed";
170         return ret;
171       }
172       auto data_ptr =
173         output->allocator()->Malloc(img_size.width, img_size.height, static_cast<enum DataType>(output->data_type()));
174       if (data_ptr == nullptr) {
175         MS_LOG(ERROR) << "Malloc data failed";
176         return RET_ERROR;
177       }
178       output->set_data(data_ptr);
179     } else {
180       ret = output->MallocData();
181       if (ret != RET_OK) {
182         MS_LOG(ERROR) << "MallocData failed";
183         return ret;
184       }
185     }
186     output->ResetRefCount();
187   }
188   return RET_OK;
189 }
190 
InferShape()191 int OpenCLKernel::InferShape() {
192   if (InferShapeDone()) {
193     return RET_OK;
194   }
195   auto ret = lite::KernelInferShape(in_tensors_, out_tensors_, op_parameter_);
196   if (ret != RET_OK) {
197     MS_LOG(ERROR) << "InferShape failed, type: "
198                   << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(type()));
199     return ret;
200   }
201   return RET_OK;
202 }
203 
ReSize()204 int OpenCLKernel::ReSize() {
205   if (InferShapeDone()) {
206     return RET_OK;
207   }
208   auto ret = InferShape();
209   if (ret != RET_OK) {
210     return ret;
211   }
212   ret = CheckSpecs();
213   if (ret != RET_OK) {
214     MS_LOG(ERROR) << "ReSize failed for check kernel specs!";
215     return ret;
216   }
217   ret = Prepare();
218   if (ret != RET_OK) {
219     MS_LOG(ERROR) << "ReSize failed for kernel prepare!";
220     return ret;
221   }
222   return RET_OK;
223 }
224 
GenerateTuningParam()225 std::vector<BaseTuningParameter> OpenCLKernel::GenerateTuningParam() {
226   size_t ndim = global_size_.size();
227   std::vector<BaseTuningParameter> tuning_params = {};
228   if (ndim == 0) {
229     MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null.";
230     return tuning_params;
231   }
232   BaseTuningParameter default_tuning_param = BaseTuningParameter();
233   default_tuning_param.local_size = local_size_;
234   tuning_params.push_back(default_tuning_param);
235   std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
236   size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
237   const size_t MIN_WORKGROUP_SIZE = 8;
238   std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
239   std::set<size_t> candidate_y = {1};
240   std::set<size_t> candidate_z = {1};
241   if (ndim > 1) {
242     candidate_y = GenerateLocalByGlobal(global_size_[1]);
243   }
244   if (ndim > 2) {
245     candidate_z = GenerateLocalByGlobal(global_size_[2]);
246   }
247   for (auto x : candidate_x) {
248     if (x <= max_work_items[0]) {
249       for (auto y : candidate_y) {
250         if (y <= max_work_items[1]) {
251           for (auto z : candidate_z) {
252             auto group_size = x * y * z;
253             if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
254               BaseTuningParameter tuning_param = BaseTuningParameter();
255               tuning_param.local_size = {x, y, z};
256               tuning_params.push_back(tuning_param);
257             }
258           }
259         }
260       }
261     }
262   }
263   return tuning_params;
264 }
265 
AssignTuningParam(const BaseTuningParameter & param)266 int OpenCLKernel::AssignTuningParam(const BaseTuningParameter &param) {
267   std::vector<size_t> local_size_tmp = param.local_size;
268   if (local_size_tmp.size() > global_size_.size()) {
269     local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size());
270   }
271   AlignGlobalLocal(global_size_, local_size_tmp);
272   return RET_OK;
273 }
274 
Tune()275 int OpenCLKernel::Tune() {
276   if (!ocl_runtime_->isProfiling()) {
277     MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling.";
278     return RET_OK;
279   }
280   lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode();
281   if (mode == lite::opencl::TuningMode::DEFAULT) {
282     return RET_OK;
283   }
284   static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2DFusion,
285                                               schema::PrimitiveType_Conv2dTransposeFusion};
286   if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
287     return RET_OK;
288   }
289   auto tuning_params = GenerateTuningParam();
290   if (tuning_params.empty()) {
291     MS_LOG(WARNING) << "Tuning param size is 0.";
292     return RET_OK;
293   }
294   int index = -1;
295   double min_time = MAX_PROFILING_TIME_MILLI_SECOND;
296   for (int i = 0; i < tuning_params.size(); i++) {
297     AssignTuningParam(tuning_params[i]);
298     auto ret = Run();
299     if (ret != RET_OK) {
300       MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i];
301       return ret;
302     }
303     double current_time = GetProfilingTimeMs();
304     MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms";
305     if (current_time < min_time) {
306       min_time = current_time;
307       index = i;
308     }
309   }
310   if (index != -1) {
311     MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
312                  << "ms";
313     AssignTuningParam(tuning_params[index]);
314   } else {
315     MS_LOG(WARNING) << "Cannot find suitable param.";
316   }
317   return RET_OK;
318 }
319 
GetProfilingTimeMs()320 double OpenCLKernel::GetProfilingTimeMs() {
321   if (!ocl_runtime_->isProfiling()) {
322     return MAX_PROFILING_TIME_MILLI_SECOND;
323   }
324   cl_ulong time_start;
325   cl_ulong time_end;
326   event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
327   event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
328   cl_ulong time_ns = time_end - time_start;
329   return static_cast<double>(time_ns) * 1e-6;
330 }
331 
GenerateLocalByGlobal(size_t global_i)332 std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
333   std::set<size_t> local_ = {};
334   int index = 1;
335   while (index <= global_i) {
336     local_.insert(index);
337     index *= 2;
338   }
339   for (size_t i = 1; i <= 16; i++) {
340     if (global_i % i == 0) {
341       local_.insert(i);
342     }
343   }
344   return local_;
345 }
346 
CheckSpecs()347 int OpenCLKernel::CheckSpecs() {
348   if (out_mem_type_ == lite::opencl::MemType::IMG) {
349     if (!GpuTensorInfo(out_tensors_[0]).IsImageSizeValid()) {
350       return RET_ERROR;
351     }
352   }
353   if (in_tensors_.size() > 0) {
354     if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 &&
355         in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) {
356       MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type();
357       return RET_ERROR;
358     }
359   }
360   return RET_OK;
361 }
362 }  // namespace mindspore::kernel
363