1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "src/runtime/kernel/opencl/opencl_subgraph.h"
18 #include <set>
19 #include <map>
20 #include <memory>
21 #include <string>
22 #include <utility>
23 #include "src/runtime/gpu/opencl/opencl_executor.h"
24 #include "src/runtime/kernel/opencl/utils.h"
25 #include "src/runtime/kernel/opencl/kernel/to_format.h"
26 #include "include/errorcode.h"
27 #include "src/common/utils.h"
28 #include "src/common/prim_inner.h"
29
30 namespace mindspore::kernel {
31 using mindspore::lite::PRIM_TO_FORMAT;
32 using mindspore::lite::RET_ERROR;
33 using mindspore::lite::RET_OK;
34 using mindspore::lite::opencl::MemType;
35
~OpenCLSubGraph()36 OpenCLSubGraph::~OpenCLSubGraph() { UnInit(); }
37
ReplaceOutTensorAndKernelToConvert(const lite::Tensor * in_tensor,const std::vector<kernel::LiteKernel * > & in_kernels,lite::Tensor * new_tensor,kernel::LiteKernel * in_convert_op,MemType mem_type)38 void OpenCLSubGraph::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
39 const std::vector<kernel::LiteKernel *> &in_kernels,
40 lite::Tensor *new_tensor, kernel::LiteKernel *in_convert_op,
41 MemType mem_type) {
42 MS_ASSERT(in_convert_op);
43 auto in_opencl_op = in_convert_op;
44 for (auto &iv : in_kernels) {
45 MS_ASSERT(iv);
46 auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels();
47 auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == iv; });
48 if (fk != kernels.end()) {
49 *fk = in_convert_op;
50 } else {
51 kernels.emplace_back(in_convert_op);
52 }
53 auto tensors = (mem_type == MemType::IMG) ? iv->in_tensors() : iv->out_tensors();
54 auto ft = std::find_if(tensors.begin(), tensors.end(), [&](lite::Tensor *kv) { return kv == in_tensor; });
55 if (ft != tensors.end()) {
56 *ft = new_tensor;
57 } else {
58 tensors.emplace_back(new_tensor);
59 }
60 if (mem_type == MemType::IMG) {
61 iv->set_in_kernels(kernels);
62 iv->set_in_tensors(tensors);
63 in_opencl_op->AddOutKernel(iv);
64 } else {
65 iv->set_out_kernels(kernels);
66 iv->set_out_tensors(tensors);
67 in_convert_op->AddInKernel(iv);
68 }
69 }
70 }
71
GenToFormatOp(const std::vector<lite::Tensor * > & in_tensors,const std::vector<std::vector<kernel::LiteKernel * >> & in_kernels,std::vector<lite::Tensor * > * out_tensors,std::vector<OpenCLToFormatParameter * > * out_parameters,std::vector<LiteKernel * > * out_convert_ops,MemType mem_type)72 int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
73 const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
74 std::vector<lite::Tensor *> *out_tensors,
75 std::vector<OpenCLToFormatParameter *> *out_parameters,
76 std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
77 MS_ASSERT(out_tensors);
78 MS_ASSERT(out_parameters);
79 MS_ASSERT(out_convert_ops);
80 out_tensors->clear();
81 out_parameters->clear();
82 out_convert_ops->clear();
83 std::vector<std::vector<kernel::LiteKernel *>> loop_kernels;
84 if (mem_type == MemType::BUF) {
85 GetKernelFromToTensor(in_tensors, nodes_, &loop_kernels, true);
86 }
87
88 for (size_t i = 0; i < in_tensors.size(); ++i) {
89 auto *in_tensor = in_tensors.at(i);
90 auto *new_tensor = new (std::nothrow)
91 lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
92 MS_ASSERT(new_tensor);
93 if (new_tensor == nullptr) {
94 MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
95 return RET_ERROR;
96 }
97 for (const auto ¶m : in_tensor->quant_params()) {
98 new_tensor->AddQuantParam(param);
99 }
100
101 out_tensors->emplace_back(new_tensor);
102 KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
103 auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
104 MS_ASSERT(parameter);
105 if (parameter == nullptr) {
106 MS_LOG(ERROR) << "OpenCLSubGraph new parameter failed!";
107 delete new_tensor;
108 new_tensor = nullptr;
109 return RET_ERROR;
110 }
111
112 parameter->op_parameter.is_zero_shape_ = false;
113 parameter->op_parameter.type_ = PRIM_TO_FORMAT;
114 parameter->out_mem_type = mem_type;
115 out_parameters->emplace_back(parameter);
116 InnerKernel *in_convert_op_inner = nullptr;
117 if (mem_type == MemType::IMG) {
118 in_convert_op_inner = OpenCLKernelCreator<ToFormatOpenCLKernel>(
119 {in_tensor}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), this->Context(), desc);
120 } else {
121 in_convert_op_inner = OpenCLKernelCreator<ToFormatOpenCLKernel>(
122 {new_tensor}, {in_tensor}, reinterpret_cast<OpParameter *>(parameter), this->Context(), desc);
123 }
124 MS_ASSERT(in_convert_op_inner);
125 if (in_convert_op_inner == nullptr ||
126 reinterpret_cast<ToFormatOpenCLKernel *>(in_convert_op_inner)->CheckSpecs() != RET_OK) {
127 MS_LOG(ERROR) << "OpenCLSubGraph create op failed!";
128 delete new_tensor;
129 new_tensor = nullptr;
130 free(parameter);
131 parameter = nullptr;
132 return RET_ERROR;
133 }
134 std::shared_ptr<kernel::Kernel> inner_convert_op(in_convert_op_inner);
135 auto *in_convert_op = new (std::nothrow) kernel::LiteKernel(inner_convert_op);
136 if (in_convert_op == nullptr) {
137 MS_LOG(ERROR) << "OpenCLSubGraph create op failed!";
138 delete new_tensor;
139 new_tensor = nullptr;
140 free(parameter);
141 parameter = nullptr;
142 return RET_ERROR;
143 }
144 static int index = 0;
145 in_convert_op->set_name("ToFormat_" + std::to_string(index++));
146
147 ReplaceOutTensorAndKernelToConvert(in_tensor, in_kernels.at(i), new_tensor, in_convert_op, mem_type);
148
149 // replace in_tensor of inner kernel which use out tensor
150 if (mem_type == MemType::BUF) {
151 for (auto &iv : loop_kernels[i]) {
152 MS_ASSERT(iv);
153 auto tensors = iv->in_tensors();
154 auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i));
155 if (jv != tensors.end()) {
156 *jv = new_tensor;
157 iv->set_in_tensors(tensors);
158 }
159 }
160 }
161
162 out_convert_ops->emplace_back(in_convert_op);
163 }
164 return RET_OK;
165 }
166
InsertOpsPass()167 int OpenCLSubGraph::InsertOpsPass() {
168 GetInOutNodes();
169
170 std::vector<std::vector<kernel::LiteKernel *>> from_kernels_;
171 GetKernelFromToTensor(in_tensors(), in_nodes_, &from_kernels_, true);
172 int ret =
173 GenToFormatOp(in_tensors(), from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, MemType::IMG);
174 if (ret != RET_OK) {
175 return ret;
176 }
177 nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end());
178
179 std::vector<std::vector<kernel::LiteKernel *>> to_kernels_;
180 GetKernelFromToTensor(out_tensors(), out_nodes_, &to_kernels_, false);
181 ret =
182 GenToFormatOp(out_tensors(), to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, MemType::BUF);
183 if (ret != RET_OK) {
184 return ret;
185 }
186 nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end());
187 GetInOutNodes();
188 return RET_OK;
189 }
190
Init()191 int OpenCLSubGraph::Init() {
192 // The fp16 operator in heterogeneous scenes needs to be set to fp32
193 // to prevent the frame from being converted to fp16 in advance.
194 if (in_tensors()[0]->data_type() == kNumberTypeFloat32 || in_tensors()[0]->data_type() == kNumberTypeFloat16) {
195 desc_.data_type = in_tensors()[0]->data_type();
196 }
197 allocator_ = ocl_runtime_->GetAllocator();
198 MS_LOG(DEBUG) << "input num=" << in_tensors().size() << ", output num=" << out_tensors().size();
199 for (const auto tensor : in_tensors()) {
200 MS_ASSERT(tensor);
201 tensor->set_allocator(allocator_);
202 }
203 for (const auto tensor : out_tensors()) {
204 MS_ASSERT(tensor);
205 tensor->set_allocator(allocator_);
206 }
207 std::vector<std::pair<std::string, std::function<int(void)>>> pass_manager{
208 {"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)},
209 {"InsertOpsPass", std::bind(&OpenCLSubGraph::InsertOpsPass, this)},
210 {"UpdateTensorDataTypePass", std::bind(&OpenCLSubGraph::UpdateTensorDataTypePass, this)},
211 };
212 for (auto iv : pass_manager) {
213 auto ret = iv.second();
214 if (ret != RET_OK) {
215 MS_LOG(ERROR) << "Run Pass: " << iv.first << " failed.";
216 return RET_ERROR;
217 }
218 }
219 return RET_OK;
220 }
221
UpdateTensorDataTypePass()222 int OpenCLSubGraph::UpdateTensorDataTypePass() {
223 bool is_fp16 = ocl_runtime_->GetFp16Enable();
224 if (is_fp16 && subgraph_type() == kGpuFp16SubGraph) {
225 std::set<lite::Tensor *> out_set;
226 auto in_tensors = this->in_tensors();
227 auto out_tensors = this->out_tensors();
228 out_set.insert(in_tensors.begin(), in_tensors.end());
229 out_set.insert(out_tensors.begin(), out_tensors.end());
230 for (auto iv : nodes_) {
231 MS_ASSERT(iv);
232 auto cur_outs = iv->out_tensors();
233 // if softmax is last kernel, output fp32 tensor
234 if (iv->type() == schema::PrimitiveType_Softmax) {
235 bool last_kernel = true;
236 for (auto k : iv->out_kernels()) {
237 int type = k->op_parameter() == nullptr ? k->type() : k->op_parameter()->type_;
238 if (type == lite::PRIM_TO_FORMAT) {
239 last_kernel = false;
240 break;
241 }
242 }
243 if (last_kernel) continue;
244 }
245 for (auto jv : cur_outs) {
246 if (out_set.count(jv) == 0) {
247 MS_ASSERT(jv);
248 // if Fp16Enable, only change fp32 to fp16, other dtype is reserved
249 if (jv->data_type() == kNumberTypeFloat32) {
250 jv->set_data_type(kNumberTypeFloat16);
251 }
252 }
253 }
254 }
255 }
256 return RET_OK;
257 }
258
GetKernelFromToTensor(const std::vector<lite::Tensor * > & in_tensors,const std::vector<kernel::LiteKernel * > & in_kernels,std::vector<std::vector<kernel::LiteKernel * >> * out_kernels,bool is_from)259 void OpenCLSubGraph::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
260 const std::vector<kernel::LiteKernel *> &in_kernels,
261 std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from) {
262 std::vector<std::set<lite::Tensor *>> ksets;
263 for (auto jv : in_kernels) {
264 MS_ASSERT(jv);
265 auto tens = is_from ? jv->in_tensors() : jv->out_tensors();
266 std::set<lite::Tensor *> kset;
267 kset.insert(tens.begin(), tens.end());
268 ksets.emplace_back(kset);
269 }
270 MS_ASSERT(out_kernels);
271 for (auto in_tensor : in_tensors) {
272 std::vector<kernel::LiteKernel *> kvec;
273 for (size_t j = 0; j < in_kernels.size(); ++j) {
274 if (ksets[j].count(in_tensor)) {
275 kvec.emplace_back(in_kernels[j]);
276 }
277 }
278 out_kernels->emplace_back(kvec);
279 }
280 }
281
GetInOutNodes()282 void OpenCLSubGraph::GetInOutNodes() {
283 this->in_nodes_.clear();
284 this->out_nodes_.clear();
285 auto in_tensors = this->in_tensors();
286 auto out_tensors = this->out_tensors();
287 for (auto *node : nodes_) {
288 for (auto *tensor : node->in_tensors()) {
289 if (std::find(in_tensors.begin(), in_tensors.end(), tensor) != in_tensors.end()) {
290 in_nodes_.emplace_back(node);
291 break;
292 }
293 }
294 for (auto *tensor : node->out_tensors()) {
295 if (std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end()) {
296 out_nodes_.emplace_back(node);
297 break;
298 }
299 }
300 }
301 }
302
Prepare()303 int OpenCLSubGraph::Prepare() {
304 ocl_runtime_->SetFp16Enable(subgraph_type() == kGpuFp16SubGraph);
305 for (const auto tensor : in_tensors()) {
306 MS_ASSERT(tensor);
307 tensor->set_allocator(allocator_);
308 }
309 for (const auto tensor : out_tensors()) {
310 MS_ASSERT(tensor);
311 tensor->set_allocator(allocator_);
312 }
313 executor_ = new (std::nothrow) lite::opencl::OpenCLExecutor();
314 if (executor_ == nullptr) {
315 MS_LOG(ERROR) << "Create OpenCLExecutor fail";
316 return RET_ERROR;
317 }
318 for (auto node : this->nodes_) {
319 if (node == nullptr) {
320 MS_LOG(ERROR) << "node in Subgraph is nullptr";
321 return mindspore::lite::RET_NULL_PTR;
322 }
323 for (const auto tensor : node->out_tensors()) {
324 CHECK_NULL_RETURN(tensor);
325 MS_CHECK_TRUE_RET(tensor->data() == nullptr, RET_ERROR);
326 tensor->set_allocator(allocator_);
327 }
328 if (desc_.provider == kBuiltin) {
329 auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node->kernel());
330 std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMulFusion, schema::PrimitiveType_BiasAdd};
331 if (pre_init_weight_list.find(opencl_kernel->type()) != pre_init_weight_list.end()) {
332 auto ret = opencl_kernel->InitWeights();
333 if (ret != RET_OK) {
334 MS_LOG(ERROR) << "init weights " << node->name() << " failed";
335 return ret;
336 }
337 }
338 }
339 if (node->InferShapeDone()) {
340 auto ret = node->Prepare();
341 if (ret != RET_OK) {
342 MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
343 return ret;
344 }
345 }
346 }
347 if (all_kernels_infer_done_) {
348 auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
349 // If tuning_mode is DEFAULT, just malloc memory for reuse.
350 auto ret = opencl_exec->RunOrTune(in_tensors(), out_tensors(), nodes_, nullptr, nullptr, true);
351 if (ret != RET_OK) {
352 MS_LOG(ERROR) << "Run opencl Tuning failed: " << ret;
353 return ret;
354 }
355 }
356 return RET_OK;
357 }
358
UnInit()359 void OpenCLSubGraph::UnInit() {
360 for (const auto &tensor : in_convert_tensors_) {
361 delete tensor;
362 }
363 in_convert_tensors_.clear();
364 for (const auto &tensor : out_convert_tensors_) {
365 delete tensor;
366 }
367 out_convert_tensors_.clear();
368 for (const auto &op : nodes_) {
369 delete op;
370 }
371 nodes_.clear();
372 in_convert_ops_.clear();
373 out_convert_ops_.clear();
374 delete this->executor_;
375 }
376
ReSize()377 int OpenCLSubGraph::ReSize() { return ReSize(false); }
378
ReSize(bool interrupt)379 int OpenCLSubGraph::ReSize(bool interrupt) {
380 for (auto kernel : nodes_) {
381 if (kernel == nullptr) {
382 MS_LOG(ERROR) << "input kernel is nullptr!";
383 return RET_ERROR;
384 }
385 if (kernel->subgraph_type() != kernel::kNotSubGraph) {
386 MS_LOG(ERROR) << "all nodes in should be kernel";
387 return RET_ERROR;
388 }
389 std::vector<lite::Tensor *> outputs = kernel->out_tensors();
390 for (auto &output : outputs) {
391 output->FreeData();
392 output->set_shape({-1});
393 }
394 }
395 for (auto kernel : nodes_) {
396 auto ret = kernel->ReSize();
397 if (ret != RET_OK) {
398 MS_LOG(WARNING) << "ReSize " << kernel->name() << "failed!";
399 if (interrupt) {
400 return ret;
401 } else {
402 break;
403 }
404 }
405 }
406 return RET_OK;
407 }
408
Execute()409 int OpenCLSubGraph::Execute() {
410 if (executor_ == nullptr) {
411 MS_LOG(ERROR) << "executor is nullptr";
412 return RET_ERROR;
413 }
414 int ret;
415 for (auto &tensor : in_tensors()) {
416 MS_ASSERT(tensor);
417 if (tensor->data() == nullptr) {
418 MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
419 return RET_ERROR;
420 }
421 ret = allocator_->UnmapBuffer(tensor->data());
422 if (ret != RET_OK) {
423 return ret;
424 }
425 }
426
427 ret = executor_->Run(in_tensors(), out_tensors(), nodes_);
428 if (ret != RET_OK) {
429 MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
430 return ret;
431 }
432 if (!ocl_runtime_->SyncCommandQueue()) {
433 MS_LOG(ERROR) << "SyncCommandQueue failed.";
434 return RET_ERROR;
435 }
436 return RET_OK;
437 }
438
Execute(const KernelCallBack & before,const KernelCallBack & after)439 int OpenCLSubGraph::Execute(const KernelCallBack &before, const KernelCallBack &after) {
440 if (executor_ == nullptr) {
441 MS_LOG(ERROR) << "executor is nullptr";
442 return RET_ERROR;
443 }
444 int ret;
445 for (auto &tensor : in_tensors()) {
446 MS_ASSERT(tensor);
447 if (tensor->data() == nullptr) {
448 MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
449 return RET_ERROR;
450 }
451 ret = allocator_->UnmapBuffer(tensor->data());
452 if (ret != RET_OK) {
453 return ret;
454 }
455 }
456
457 ret = executor_->Run(in_tensors(), out_tensors(), nodes_, before, after);
458 if (ret != RET_OK) {
459 MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
460 return ret;
461 }
462 if (!ocl_runtime_->SyncCommandQueue()) {
463 MS_LOG(ERROR) << "SyncCommandQueue failed.";
464 return RET_ERROR;
465 }
466 return RET_OK;
467 }
468 } // namespace mindspore::kernel
469