/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "src/runtime/kernel/opencl/opencl_subgraph.h"
#include <set>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include "src/runtime/gpu/opencl/opencl_executor.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/kernel/to_format.h"
#include "include/errorcode.h"
#include "src/common/utils.h"
#include "src/common/prim_inner.h"

namespace mindspore::kernel {
using mindspore::lite::PRIM_TO_FORMAT;
using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType;

OpenCLSubGraph::~OpenCLSubGraph() { UnInit(); }

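// Rewires the kernels adjacent to a newly inserted ToFormat op: each kernel in in_kernels has its
// links redirected from in_tensor to new_tensor and in_convert_op. For MemType::IMG the kernel's
// input side is patched (the convert op feeds it); for MemType::BUF its output side is patched.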
void OpenCLSubGraph::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
                                                        const std::vector<kernel::LiteKernel *> &in_kernels,
                                                        lite::Tensor *new_tensor, kernel::LiteKernel *in_convert_op,
                                                        MemType mem_type) {
  MS_ASSERT(in_convert_op);
  auto in_opencl_op = in_convert_op;
  for (auto &iv : in_kernels) {
    MS_ASSERT(iv);
    auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels();
    auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == iv; });
    if (fk != kernels.end()) {
      *fk = in_convert_op;
    } else {
      kernels.emplace_back(in_convert_op);
    }
    auto tensors = (mem_type == MemType::IMG) ? iv->in_tensors() : iv->out_tensors();
    auto ft = std::find_if(tensors.begin(), tensors.end(), [&](lite::Tensor *kv) { return kv == in_tensor; });
    if (ft != tensors.end()) {
      *ft = new_tensor;
    } else {
      tensors.emplace_back(new_tensor);
    }
    if (mem_type == MemType::IMG) {
      iv->set_in_kernels(kernels);
      iv->set_in_tensors(tensors);
      in_opencl_op->AddOutKernel(iv);
    } else {
      iv->set_out_kernels(kernels);
      iv->set_out_tensors(tensors);
      in_convert_op->AddInKernel(iv);
    }
  }
}

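// For every tensor in in_tensors, creates a new intermediate tensor plus a ToFormat kernel that
// converts between buffer and image memory (direction depends on mem_type), then splices the new
// kernel into the graph via ReplaceOutTensorAndKernelToConvert. The created tensors, parameters
// and kernels are returned through the out_* vectors so the subgraph can release them later.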
int OpenCLSubGraph::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
                                  const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
                                  std::vector<lite::Tensor *> *out_tensors,
                                  std::vector<OpenCLToFormatParameter *> *out_parameters,
                                  std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
  MS_ASSERT(out_tensors);
  MS_ASSERT(out_parameters);
  MS_ASSERT(out_convert_ops);
  out_tensors->clear();
  out_parameters->clear();
  out_convert_ops->clear();
  std::vector<std::vector<kernel::LiteKernel *>> loop_kernels;
  if (mem_type == MemType::BUF) {
    GetKernelFromToTensor(in_tensors, nodes_, &loop_kernels, true);
  }

  for (size_t i = 0; i < in_tensors.size(); ++i) {
    auto *in_tensor = in_tensors.at(i);
    auto *new_tensor = new (std::nothrow)
      lite::Tensor(in_tensor->data_type(), in_tensor->shape(), in_tensor->format(), lite::Tensor::VAR);
    MS_ASSERT(new_tensor);
    if (new_tensor == nullptr) {
      MS_LOG(ERROR) << "OpenCLSubGraph new tensor failed!";
      return RET_ERROR;
    }
    for (const auto &param : in_tensor->quant_params()) {
      new_tensor->AddQuantParam(param);
    }

    out_tensors->emplace_back(new_tensor);
    KernelKey desc{kGPU, kNumberTypeFloat32, PRIM_TO_FORMAT};
    auto *parameter = static_cast<OpenCLToFormatParameter *>(malloc(sizeof(OpenCLToFormatParameter)));
    MS_ASSERT(parameter);
    if (parameter == nullptr) {
      MS_LOG(ERROR) << "OpenCLSubGraph new parameter failed!";
      delete new_tensor;
      new_tensor = nullptr;
      return RET_ERROR;
    }

    parameter->op_parameter.is_zero_shape_ = false;
    parameter->op_parameter.type_ = PRIM_TO_FORMAT;
    parameter->out_mem_type = mem_type;
    out_parameters->emplace_back(parameter);
    InnerKernel *in_convert_op_inner = nullptr;
    if (mem_type == MemType::IMG) {
      in_convert_op_inner = OpenCLKernelCreator<ToFormatOpenCLKernel>(
        {in_tensor}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), this->Context(), desc);
    } else {
      in_convert_op_inner = OpenCLKernelCreator<ToFormatOpenCLKernel>(
        {new_tensor}, {in_tensor}, reinterpret_cast<OpParameter *>(parameter), this->Context(), desc);
    }
    MS_ASSERT(in_convert_op_inner);
    if (in_convert_op_inner == nullptr ||
        reinterpret_cast<ToFormatOpenCLKernel *>(in_convert_op_inner)->CheckSpecs() != RET_OK) {
      MS_LOG(ERROR) << "OpenCLSubGraph create op failed!";
      delete new_tensor;
      new_tensor = nullptr;
      free(parameter);
      parameter = nullptr;
      return RET_ERROR;
    }
    std::shared_ptr<kernel::Kernel> inner_convert_op(in_convert_op_inner);
    auto *in_convert_op = new (std::nothrow) kernel::LiteKernel(inner_convert_op);
    if (in_convert_op == nullptr) {
      MS_LOG(ERROR) << "OpenCLSubGraph create op failed!";
      delete new_tensor;
      new_tensor = nullptr;
      free(parameter);
      parameter = nullptr;
      return RET_ERROR;
    }
    static int index = 0;
    in_convert_op->set_name("ToFormat_" + std::to_string(index++));

    ReplaceOutTensorAndKernelToConvert(in_tensor, in_kernels.at(i), new_tensor, in_convert_op, mem_type);

    // replace the in_tensor of inner kernels that still reference the old output tensor
    if (mem_type == MemType::BUF) {
      for (auto &iv : loop_kernels[i]) {
        MS_ASSERT(iv);
        auto tensors = iv->in_tensors();
        auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i));
        if (jv != tensors.end()) {
          *jv = new_tensor;
          iv->set_in_tensors(tensors);
        }
      }
    }

    out_convert_ops->emplace_back(in_convert_op);
  }
  return RET_OK;
}

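// Graph pass that inserts ToFormat conversion kernels at the subgraph boundaries: buffer-to-image
// conversions in front of the input nodes and image-to-buffer conversions after the output nodes,
// then refreshes the cached in/out node lists.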
int OpenCLSubGraph::InsertOpsPass() {
  GetInOutNodes();

  std::vector<std::vector<kernel::LiteKernel *>> from_kernels_;
  GetKernelFromToTensor(in_tensors(), in_nodes_, &from_kernels_, true);
  int ret =
    GenToFormatOp(in_tensors(), from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, MemType::IMG);
  if (ret != RET_OK) {
    return ret;
  }
  nodes_.insert(nodes_.begin(), in_convert_ops_.begin(), in_convert_ops_.end());

  std::vector<std::vector<kernel::LiteKernel *>> to_kernels_;
  GetKernelFromToTensor(out_tensors(), out_nodes_, &to_kernels_, false);
  ret =
    GenToFormatOp(out_tensors(), to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, MemType::BUF);
  if (ret != RET_OK) {
    return ret;
  }
  nodes_.insert(nodes_.end(), out_convert_ops_.begin(), out_convert_ops_.end());
  GetInOutNodes();
  return RET_OK;
}

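// Initializes the subgraph: inherits the input data type, binds the OpenCL allocator to the
// boundary tensors, and runs the graph passes (fusion, ToFormat insertion, tensor data type update).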
int OpenCLSubGraph::Init() {
  // In heterogeneous scenarios, fp16 operators need to be set to fp32
  // to prevent the framework from converting them to fp16 in advance.
  if (in_tensors()[0]->data_type() == kNumberTypeFloat32 || in_tensors()[0]->data_type() == kNumberTypeFloat16) {
    desc_.data_type = in_tensors()[0]->data_type();
  }
  allocator_ = ocl_runtime_->GetAllocator();
  MS_LOG(DEBUG) << "input num=" << in_tensors().size() << ", output num=" << out_tensors().size();
  for (const auto tensor : in_tensors()) {
    MS_ASSERT(tensor);
    tensor->set_allocator(allocator_);
  }
  for (const auto tensor : out_tensors()) {
    MS_ASSERT(tensor);
    tensor->set_allocator(allocator_);
  }
  std::vector<std::pair<std::string, std::function<int(void)>>> pass_manager{
    {"FusionPass", std::bind(&OpenCLSubGraph::FusionPass, this)},
    {"InsertOpsPass", std::bind(&OpenCLSubGraph::InsertOpsPass, this)},
    {"UpdateTensorDataTypePass", std::bind(&OpenCLSubGraph::UpdateTensorDataTypePass, this)},
  };
  for (auto iv : pass_manager) {
    auto ret = iv.second();
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Run Pass: " << iv.first << " failed.";
      return RET_ERROR;
    }
  }
  return RET_OK;
}

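// When fp16 inference is enabled, switches internal fp32 tensors to fp16. Subgraph input and
// output tensors keep their type, and Softmax outputs get special handling so the final result
// can stay fp32.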
int OpenCLSubGraph::UpdateTensorDataTypePass() {
  bool is_fp16 = ocl_runtime_->GetFp16Enable();
  if (is_fp16 && subgraph_type() == kGpuFp16SubGraph) {
    std::set<lite::Tensor *> out_set;
    auto in_tensors = this->in_tensors();
    auto out_tensors = this->out_tensors();
    out_set.insert(in_tensors.begin(), in_tensors.end());
    out_set.insert(out_tensors.begin(), out_tensors.end());
    for (auto iv : nodes_) {
      MS_ASSERT(iv);
      auto cur_outs = iv->out_tensors();
      // if softmax is the last kernel, keep its output as an fp32 tensor
      if (iv->type() == schema::PrimitiveType_Softmax) {
        bool last_kernel = true;
        for (auto k : iv->out_kernels()) {
          int type = k->op_parameter() == nullptr ? k->type() : k->op_parameter()->type_;
          if (type == lite::PRIM_TO_FORMAT) {
            last_kernel = false;
            break;
          }
        }
        if (last_kernel) continue;
      }
      for (auto jv : cur_outs) {
        if (out_set.count(jv) == 0) {
          MS_ASSERT(jv);
          // if fp16 is enabled, only change fp32 to fp16; other dtypes are preserved
          if (jv->data_type() == kNumberTypeFloat32) {
            jv->set_data_type(kNumberTypeFloat16);
          }
        }
      }
    }
  }
  return RET_OK;
}

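// For each tensor in in_tensors, collects the kernels from in_kernels that consume it
// (is_from == true, matched against kernel inputs) or produce it (is_from == false, matched
// against kernel outputs); the result is appended to out_kernels, one vector per tensor.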
void OpenCLSubGraph::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
                                           const std::vector<kernel::LiteKernel *> &in_kernels,
                                           std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from) {
  std::vector<std::set<lite::Tensor *>> ksets;
  for (auto jv : in_kernels) {
    MS_ASSERT(jv);
    auto tens = is_from ? jv->in_tensors() : jv->out_tensors();
    std::set<lite::Tensor *> kset;
    kset.insert(tens.begin(), tens.end());
    ksets.emplace_back(kset);
  }
  MS_ASSERT(out_kernels);
  for (auto in_tensor : in_tensors) {
    std::vector<kernel::LiteKernel *> kvec;
    for (size_t j = 0; j < in_kernels.size(); ++j) {
      if (ksets[j].count(in_tensor)) {
        kvec.emplace_back(in_kernels[j]);
      }
    }
    out_kernels->emplace_back(kvec);
  }
}

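// Rebuilds in_nodes_ / out_nodes_: a node is an input node if any of its input tensors is a
// subgraph input tensor, and an output node if any of its output tensors is a subgraph output.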
void OpenCLSubGraph::GetInOutNodes() {
  this->in_nodes_.clear();
  this->out_nodes_.clear();
  auto in_tensors = this->in_tensors();
  auto out_tensors = this->out_tensors();
  for (auto *node : nodes_) {
    for (auto *tensor : node->in_tensors()) {
      if (std::find(in_tensors.begin(), in_tensors.end(), tensor) != in_tensors.end()) {
        in_nodes_.emplace_back(node);
        break;
      }
    }
    for (auto *tensor : node->out_tensors()) {
      if (std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end()) {
        out_nodes_.emplace_back(node);
        break;
      }
    }
  }
}

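// Prepares the subgraph for execution: binds the allocator to all tensors, creates the
// OpenCLExecutor, pre-initializes weights for selected built-in kernel types, prepares each node
// whose shape is already inferred, and (if all shapes are known) runs a tuning/allocation pass.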
int OpenCLSubGraph::Prepare() {
  ocl_runtime_->SetFp16Enable(subgraph_type() == kGpuFp16SubGraph);
  for (const auto tensor : in_tensors()) {
    MS_ASSERT(tensor);
    tensor->set_allocator(allocator_);
  }
  for (const auto tensor : out_tensors()) {
    MS_ASSERT(tensor);
    tensor->set_allocator(allocator_);
  }
  executor_ = new (std::nothrow) lite::opencl::OpenCLExecutor();
  if (executor_ == nullptr) {
    MS_LOG(ERROR) << "Create OpenCLExecutor failed";
    return RET_ERROR;
  }
  for (auto node : this->nodes_) {
    if (node == nullptr) {
      MS_LOG(ERROR) << "node in Subgraph is nullptr";
      return mindspore::lite::RET_NULL_PTR;
    }
    for (const auto tensor : node->out_tensors()) {
      CHECK_NULL_RETURN(tensor);
      MS_CHECK_TRUE_RET(tensor->data() == nullptr, RET_ERROR);
      tensor->set_allocator(allocator_);
    }
    if (desc_.provider == kBuiltin) {
      auto opencl_kernel = reinterpret_cast<kernel::OpenCLKernel *>(node->kernel());
      std::set<int> pre_init_weight_list = {schema::PrimitiveType_MatMulFusion, schema::PrimitiveType_BiasAdd};
      if (pre_init_weight_list.find(opencl_kernel->type()) != pre_init_weight_list.end()) {
        auto ret = opencl_kernel->InitWeights();
        if (ret != RET_OK) {
          MS_LOG(ERROR) << "init weights " << node->name() << " failed";
          return ret;
        }
      }
    }
    if (node->InferShapeDone()) {
      auto ret = node->Prepare();
      if (ret != RET_OK) {
        MS_LOG(ERROR) << "prepare node " << node->name() << " failed";
        return ret;
      }
    }
  }
  if (all_kernels_infer_done_) {
    auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_);
    // If tuning_mode is DEFAULT, just malloc memory for reuse.
    auto ret = opencl_exec->RunOrTune(in_tensors(), out_tensors(), nodes_, nullptr, nullptr, true);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Run opencl Tuning failed: " << ret;
      return ret;
    }
  }
  return RET_OK;
}

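// Releases everything the subgraph owns: the conversion tensors created by GenToFormatOp,
// all node kernels, and the executor.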
void OpenCLSubGraph::UnInit() {
  for (const auto &tensor : in_convert_tensors_) {
    delete tensor;
  }
  in_convert_tensors_.clear();
  for (const auto &tensor : out_convert_tensors_) {
    delete tensor;
  }
  out_convert_tensors_.clear();
  for (const auto &op : nodes_) {
    delete op;
  }
  nodes_.clear();
  in_convert_ops_.clear();
  out_convert_ops_.clear();
  delete this->executor_;
}

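// Resizes the subgraph after an input shape change: each node's output tensors are freed and
// reset, then every node is re-sized. With interrupt == true a failing node's error code is
// propagated; otherwise the failure only stops the loop.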
int OpenCLSubGraph::ReSize() { return ReSize(false); }

int OpenCLSubGraph::ReSize(bool interrupt) {
  for (auto kernel : nodes_) {
    if (kernel == nullptr) {
      MS_LOG(ERROR) << "input kernel is nullptr!";
      return RET_ERROR;
    }
    if (kernel->subgraph_type() != kernel::kNotSubGraph) {
      MS_LOG(ERROR) << "all nodes in the subgraph should be kernels";
      return RET_ERROR;
    }
    std::vector<lite::Tensor *> outputs = kernel->out_tensors();
    for (auto &output : outputs) {
      output->FreeData();
      output->set_shape({-1});
    }
  }
  for (auto kernel : nodes_) {
    auto ret = kernel->ReSize();
    if (ret != RET_OK) {
      MS_LOG(WARNING) << "ReSize " << kernel->name() << " failed!";
      if (interrupt) {
        return ret;
      } else {
        break;
      }
    }
  }
  return RET_OK;
}

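// Runs the subgraph: unmaps the input tensors' buffers from host memory so the device can read
// them, executes all nodes through the OpenCLExecutor, and synchronizes the OpenCL command queue.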
int OpenCLSubGraph::Execute() {
  if (executor_ == nullptr) {
    MS_LOG(ERROR) << "executor is nullptr";
    return RET_ERROR;
  }
  int ret;
  for (auto &tensor : in_tensors()) {
    MS_ASSERT(tensor);
    if (tensor->data() == nullptr) {
      MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
      return RET_ERROR;
    }
    ret = allocator_->UnmapBuffer(tensor->data());
    if (ret != RET_OK) {
      return ret;
    }
  }

  ret = executor_->Run(in_tensors(), out_tensors(), nodes_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
    return ret;
  }
  if (!ocl_runtime_->SyncCommandQueue()) {
    MS_LOG(ERROR) << "SyncCommandQueue failed.";
    return RET_ERROR;
  }
  return RET_OK;
}

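// Same as Execute() above, but forwards the before/after kernel callbacks to the executor so the
// caller can observe each node's execution.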
int OpenCLSubGraph::Execute(const KernelCallBack &before, const KernelCallBack &after) {
  if (executor_ == nullptr) {
    MS_LOG(ERROR) << "executor is nullptr";
    return RET_ERROR;
  }
  int ret;
  for (auto &tensor : in_tensors()) {
    MS_ASSERT(tensor);
    if (tensor->data() == nullptr) {
      MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
      return RET_ERROR;
    }
    ret = allocator_->UnmapBuffer(tensor->data());
    if (ret != RET_OK) {
      return ret;
    }
  }

  ret = executor_->Run(in_tensors(), out_tensors(), nodes_, before, after);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
    return ret;
  }
  if (!ocl_runtime_->SyncCommandQueue()) {
    MS_LOG(ERROR) << "SyncCommandQueue failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
}  // namespace mindspore::kernel