/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/infer_manager.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "src/weight_decoder.h"
#include "src/common/file_utils.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::ImageSize;

namespace mindspore::kernel {
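// Rounds each global work-size dimension up to a multiple of the corresponding local size and stores the
// results as the cl::NDRange objects used when the kernel is enqueued.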
void OpenCLKernel::AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
  std::vector<size_t> internal_global_ws = global;
  for (size_t i = 0; i < local.size(); ++i) {
    internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
  }

  MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
  for (size_t i = 0; i < global.size(); i++) {
    MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
  }
  for (size_t i = 0; i < local.size(); i++) {
    MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
  }
  if (local.empty()) {
    local_range_ = cl::NullRange;
  }
  if (global.size() == 1) {
    global_range_ = cl::NDRange(internal_global_ws.at(0));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0));
    }
  } else if (global.size() == 2) {
    global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0), local.at(1));
    }
  } else if (global.size() >= 3) {
    global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
    if (!local.empty()) {
      local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
    }
  }
}

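// Fills img_size with the 2-D image width/height of the idx-th output tensor and the OpenCL channel data
// type that matches the tensor's element type.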
int OpenCLKernel::GetImageSize(size_t idx, lite::opencl::ImageSize *img_size) {
  MS_ASSERT(img_size);
  if (idx >= out_tensors_.size()) {
    return RET_ERROR;
  }
  auto img_info = GpuTensorInfo(out_tensors_[idx]);
  size_t img_dtype = CL_FLOAT;
  switch (out_tensors_[idx]->data_type()) {
    case kNumberTypeFloat32: {
      img_dtype = CL_FLOAT;
      break;
    }
    case kNumberTypeInt32: {
      img_dtype = CL_SIGNED_INT32;
      break;
    }
    case kNumberTypeFloat16: {
      img_dtype = CL_HALF_FLOAT;
      break;
    }
    case kNumberTypeInt8: {
      img_dtype = CL_SIGNED_INT8;
      break;
    }
    default: {
      MS_LOG(WARNING) << "Unsupported data_type " << out_tensors_[idx]->data_type();
      return RET_ERROR;
    }
  }
  *img_size = {img_info.width, img_info.height, img_dtype};
  return RET_OK;
}

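// Debug helper: synchronizes the command queue, copies the first output tensor back to host memory, prints
// its shape and the first print_num elements, and optionally dumps the raw data to out_file.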
void OpenCLKernel::PrintOutput(int print_num, const std::string &out_file) {
  printf("%-30s ", name().c_str());
  if (out_tensors().empty()) {
    return;
  }
  auto *tensor = out_tensors()[0];
  auto mem_type = GetMemType();
  if (tensor == nullptr || tensor->data() == nullptr) {
    return;
  }

  GpuTensorInfo img_info(tensor);
  auto size = mem_type == lite::opencl::MemType::BUF ? img_info.OriginSize : img_info.Image2DSize;
  std::vector<char> data(size);
  auto runtime_wrapper = lite::opencl::OpenCLRuntimeInnerWrapper();
  auto runtime = runtime_wrapper.GetInstance();
  auto allocator = runtime->GetAllocator();
  if (!runtime->SyncCommandQueue()) {
    MS_LOG(ERROR) << "SyncCommandQueue failed.";
  }
  if (mem_type == lite::opencl::MemType::BUF) {
    if (allocator->MapBuffer(tensor->data(), CL_MAP_READ, nullptr, true) == nullptr) {
      MS_LOG(ERROR) << "Map Buffer failed.";
    }
    memcpy(data.data(), tensor->data(), img_info.OriginSize);
    if (allocator->UnmapBuffer(tensor->data()) != RET_OK) {
      MS_LOG(ERROR) << "UnmapBuffer failed.";
    }
  } else {
    runtime->ReadImage(tensor->data(), data.data());
  }

  printf("shape=(");
  auto shape = tensor->shape();
  for (size_t i = 0; i < shape.size(); ++i) {
    printf("%4d", shape[i]);
    if (i + 1 < shape.size()) {
      printf(",");
    }
  }
  printf(") ");

  auto total_num = mem_type == lite::opencl::MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num;
  for (int i = 0; i < print_num && static_cast<size_t>(i) < total_num; ++i) {
    if (tensor->data_type() == kNumberTypeInt32) {
      printf("%d %7d | ", i, reinterpret_cast<int32_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat16) {
      printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeFloat32) {
      printf("%d %7.3f | ", i, reinterpret_cast<float *>(data.data())[i]);
    } else if (tensor->data_type() == kNumberTypeInt8) {
      printf("%d %7d | ", i, static_cast<int>(reinterpret_cast<int8_t *>(data.data())[i]));
    }
  }
  printf("\n");

  if (!out_file.empty()) {
    (void)lite::WriteToBin(out_file, data.data(), data.size());
  }
}

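// Re-runs shape inference and allocates device memory (an image or a buffer, depending on the kernel's
// memory type) for every output tensor before the kernel executes.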
int OpenCLKernel::PreProcess() {
  int ret = ReSize();
  if (ret != RET_OK) {
    return ret;
  }
  for (size_t i = 0; i < out_tensors_.size(); ++i) {
    auto *output = out_tensors_.at(i);
    CHECK_NULL_RETURN(output);
    CHECK_NULL_RETURN(output->allocator());
    if (GetMemType() == lite::opencl::MemType::IMG) {
      ImageSize img_size;
      ret = GetImageSize(i, &img_size);
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "GetImageSize failed";
        return ret;
      }
      auto data_ptr =
        output->allocator()->Malloc(img_size.width, img_size.height, static_cast<enum DataType>(output->data_type()));
      if (data_ptr == nullptr) {
        MS_LOG(ERROR) << "Malloc data failed";
        return RET_ERROR;
      }
      output->set_data(data_ptr);
    } else {
      ret = output->MallocData();
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "MallocData failed";
        return ret;
      }
    }
    output->ResetRefCount();
  }
  return RET_OK;
}

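// Runs the common shape-inference pass for this kernel's operator if it has not been completed yet.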
int OpenCLKernel::InferShape() {
  if (InferShapeDone()) {
    return RET_OK;
  }
  auto ret = lite::KernelInferShape(in_tensors_, out_tensors_, op_parameter_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InferShape failed, type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(type()));
    return ret;
  }
  return RET_OK;
}

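// Re-infers output shapes and then re-validates and re-prepares the kernel so it matches the new shapes.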
int OpenCLKernel::ReSize() {
  if (InferShapeDone()) {
    return RET_OK;
  }
  auto ret = InferShape();
  if (ret != RET_OK) {
    return ret;
  }
  ret = CheckSpecs();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ReSize failed: kernel spec check failed.";
    return ret;
  }
  ret = Prepare();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ReSize failed: kernel prepare failed.";
    return ret;
  }
  return RET_OK;
}

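// Enumerates candidate local work-group sizes: per global dimension it combines powers of two and small
// divisors, keeping only combinations that respect the device work-item and work-group limits.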
std::vector<BaseTuningParameter> OpenCLKernel::GenerateTuningParam() {
  size_t ndim = global_size_.size();
  std::vector<BaseTuningParameter> tuning_params = {};
  if (ndim == 0) {
    MS_LOG(ERROR) << "Generate tuning param failed: global_size_ is empty.";
    return tuning_params;
  }
  BaseTuningParameter default_tuning_param = BaseTuningParameter();
  default_tuning_param.local_size = local_size_;
  tuning_params.push_back(default_tuning_param);
  std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
  size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
  const size_t MIN_WORKGROUP_SIZE = 8;
  std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
  std::set<size_t> candidate_y = {1};
  std::set<size_t> candidate_z = {1};
  if (ndim > 1) {
    candidate_y = GenerateLocalByGlobal(global_size_[1]);
  }
  if (ndim > 2) {
    candidate_z = GenerateLocalByGlobal(global_size_[2]);
  }
  for (auto x : candidate_x) {
    if (x <= max_work_items[0]) {
      for (auto y : candidate_y) {
        if (y <= max_work_items[1]) {
          for (auto z : candidate_z) {
            auto group_size = x * y * z;
            if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
              BaseTuningParameter tuning_param = BaseTuningParameter();
              tuning_param.local_size = {x, y, z};
              tuning_params.push_back(tuning_param);
            }
          }
        }
      }
    }
  }
  return tuning_params;
}

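// Applies one candidate local size, truncated to the rank of the global work size, by re-aligning the
// global/local NDRange pair.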
int OpenCLKernel::AssignTuningParam(const BaseTuningParameter &param) {
  std::vector<size_t> local_size_tmp = param.local_size;
  if (local_size_tmp.size() > global_size_.size()) {
    local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size());
  }
  AlignGlobalLocal(global_size_, local_size_tmp);
  return RET_OK;
}

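// Benchmarks the kernel once per candidate local size (when profiling is enabled) and keeps the fastest
// configuration; warns if no candidate finishes within the profiling time limit.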
int OpenCLKernel::Tune() {
  if (!ocl_runtime_->isProfiling()) {
    MS_LOG(WARNING) << "Tuning mode requires OpenCL runtime profiling.";
    return RET_OK;
  }
  lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode();
  if (mode == lite::opencl::TuningMode::DEFAULT) {
    return RET_OK;
  }
  static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2DFusion,
                                              schema::PrimitiveType_Conv2dTransposeFusion};
  if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
    return RET_OK;
  }
  auto tuning_params = GenerateTuningParam();
  if (tuning_params.empty()) {
    MS_LOG(WARNING) << "Tuning param size is 0.";
    return RET_OK;
  }
  int index = -1;
  double min_time = MAX_PROFILING_TIME_MILLI_SECOND;
  for (size_t i = 0; i < tuning_params.size(); i++) {
    AssignTuningParam(tuning_params[i]);
    auto ret = Run();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i];
      return ret;
    }
    double current_time = GetProfilingTimeMs();
    MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exec time " << current_time << "ms";
    if (current_time < min_time) {
      min_time = current_time;
      index = static_cast<int>(i);
    }
  }
  if (index != -1) {
    MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exec time " << min_time
                 << "ms";
    AssignTuningParam(tuning_params[index]);
  } else {
    MS_LOG(WARNING) << "Cannot find a suitable param.";
  }
  return RET_OK;
}

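// Returns the kernel's device execution time in milliseconds, taken from the OpenCL profiling event, or the
// maximum profiling time when profiling is disabled.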
double OpenCLKernel::GetProfilingTimeMs() {
  if (!ocl_runtime_->isProfiling()) {
    return MAX_PROFILING_TIME_MILLI_SECOND;
  }
  cl_ulong time_start;
  cl_ulong time_end;
  event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
  event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
  cl_ulong time_ns = time_end - time_start;
  return static_cast<double>(time_ns) * 1e-6;
}

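// Collects candidate local sizes for a single dimension: every power of two not exceeding global_i plus
// every divisor of global_i that is at most 16.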
std::set<size_t> OpenCLKernel::GenerateLocalByGlobal(size_t global_i) {
  std::set<size_t> local_ = {};
  size_t index = 1;
  while (index <= global_i) {
    local_.insert(index);
    index *= 2;
  }
  for (size_t i = 1; i <= 16; i++) {
    if (global_i % i == 0) {
      local_.insert(i);
    }
  }
  return local_;
}

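// Basic validity checks shared by OpenCL kernels: the first output must fit into a 2-D image when image
// memory is used, and the first input must have a supported data type.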
int OpenCLKernel::CheckSpecs() {
  if (out_mem_type_ == lite::opencl::MemType::IMG) {
    if (!GpuTensorInfo(out_tensors_[0]).IsImageSizeValid()) {
      return RET_ERROR;
    }
  }
  if (!in_tensors_.empty()) {
    if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16 &&
        in_tensors_[0]->data_type() != kNumberTypeInt32 && in_tensors_[0]->data_type() != kNumberTypeInt8) {
      MS_LOG(WARNING) << "Unsupported data type: " << in_tensors_[0]->data_type();
      return RET_ERROR;
    }
  }
  return RET_OK;
}
}  // namespace mindspore::kernel